diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..62e965b0a44a6869ae1ca6308bd62ba8aa3d6500
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "up_proj",
+    "gate_proj",
+    "down_proj",
+    "q_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..094c5f455bf35a3701037f8cf928766bda4e2fdf
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6471582fd50057e84dfb08f495660314c24a16a94f8ac9426da1edcd17b1603
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4fdf7989abc46c9a30ad7289defc684eaadae19e
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:582698d24359f6be8e4df167566915dd311ba98e0ec3ef337f6141b3712b3e53
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..69c3e79ecc4618eceb3a7a2c6df65a1eb4bea614
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9719731560555253,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4225,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.0156967480828982,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3212,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.9238233668481546,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.4215,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7973955034809719,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.3138,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.6758347176205124,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.0945,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.971727217409248,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.31,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.7867392045118493,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1325,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.1691873896204046,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.1948,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 1.1988387781152483,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.1633,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9327563094331902,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.1291,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.8979794621884146,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.0959,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6473368763743381,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 1.0195,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5888234879398717,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 1.0351,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.6835992554214257,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.916,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6155398013768341,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9745,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6424060942035926,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9484,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.532550668054331,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.9576,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6359298776562393,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9781,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.7033509923785414,
+      "learning_rate": 0.0002,
+      "loss": 1.0213,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.6136991168036158,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8679,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5683787919304325,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8792,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.6975898261539225,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 1.122,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.625225000612276,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 1.071,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5842836648495332,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 1.0011,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6425014712125224,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 1.0429,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5766587264596573,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9554,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5155142424732911,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8836,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.555621828495599,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8526,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.692065965650676,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 1.0003,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6149606129626048,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 1.0233,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.580044488482369,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.944,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5745893317976081,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8352,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4949387938042247,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8841,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5776965982580251,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 1.083,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.5236266882773968,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.9272,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.724621871905579,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.9061,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.57728187410348,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.9413,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5937644360791755,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8739,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.5885045750020044,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.9292,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5375604144078661,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.9803,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5224367418365979,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.812,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.5626009709995728,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8799,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.502499897060231,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.9542,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5909498723276309,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.9373,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.48624540596138904,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.886,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.7926471063751397,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 1.0794,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.5005786861152867,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.9054,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5011911229465077,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9255,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.49153908455646683,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.9401,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7739765898833159,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.9633,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.5981328278567677,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.9323,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.6930023972779622,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 1.0079,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5086530254159303,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8875,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5833312107461119,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8608,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.60348201124767,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.9313,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.48903426313247045,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.9042,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4897146593936715,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8525,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5551167051993521,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.9554,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.5561672558240812,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8759,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5074154141993045,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.882,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.6574299113732572,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 1.0118,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.6004577635455566,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.882,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.646991913244477,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.9729,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5377906520286383,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.9647,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5403821486665964,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8416,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5958932417106001,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.9724,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5380838249700484,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.9369,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5767496831021328,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.9476,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5025417082598497,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8396,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5741017467302606,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8811,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.5606179041892586,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8185,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5649379648112125,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8958,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.46302038221209,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.785,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5873484897971788,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.976,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5372137888546485,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.85,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5283284433789608,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.958,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.527865127960602,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8401,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5918748384738277,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8535,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5504783750584027,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.8036,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6339201701822237,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.9951,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.6336655475736347,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 1.0156,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.54718693044338,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8811,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.5974586650433192,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8694,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.48432957860470743,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8565,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.5501452529707288,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.9211,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.581601886863996,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.944,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5453290568380136,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.9025,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5137363846987513,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8192,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.43975358822950616,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7649,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6477805482411492,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8912,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.5549996594907376,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.9085,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5729456916395933,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8908,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.48159418626487066,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7758,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5822518035352405,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8969,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5243033839009505,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8717,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5346613252480791,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8805,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.6011773508431371,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8725,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4346343771047141,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8198,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.5150804851014471,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.9334,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7443920020926502,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8586,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.6326828101232339,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.9827,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4786775007935448,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8316,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.7659943463209778,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.9851,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5138606340061297,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8502,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.6883167701504389,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.9245,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.5710717799313989,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8528,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.500567838603159,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8482,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5275356495515479,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.9279,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.42710809202511696,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7936,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.6248144346372322,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7894,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.5274604228677718,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.9088,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5020540127916125,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8505,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.45423127056696233,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7561,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.646411592003773,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.9523,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.6011236294592678,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.8715,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.6231809150699286,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.889,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.5530901333727157,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8234,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5922878804590594,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.9207,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.47832642676613,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8734,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5546011631385598,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8495,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.6641640855239751,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 1.0706,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5164084220861261,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8893,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.6047446965306861,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 1.0178,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.5902333542356276,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8912,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5105567603891245,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.827,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.45849702835193873,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8058,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.6350828160949975,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 1.0222,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5953872318050398,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.9346,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.460052987155817,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7464,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5563549169254246,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.9149,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5503538093328367,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.9234,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5528895401914546,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.9652,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.517006820603383,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.8664,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.6855622536343549,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 1.0355,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5230125244800105,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.9216,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.5450231518355951,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7938,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.5292627538308421,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8114,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4524328604491942,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8356,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.5826249323016475,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.9046,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5178297827336125,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8889,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.5172877366154405,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.866,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.8026104835258362,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 1.1493,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.6496777951298773,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8841,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5320786252973128,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8307,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4383389820784073,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.78,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.6606834795469824,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.9924,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.42540856147138034,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7706,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5845185060302059,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7887,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.5721599134367784,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.8799,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5268683281377597,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8091,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.43845575615203913,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.6761,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.43555546938609113,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8649,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.536111821484135,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8726,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5347321227840028,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.8387,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.45982293352296677,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7505,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.49801018069699443,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.9193,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.5387030949788503,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7959,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.48118102523874073,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.8368,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.5337695716325006,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.8524,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.49259858479074997,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.8575,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5416874456814282,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.9517,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.6131632969712694,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.921,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5161966804309467,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8919,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5670918355550684,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.8829,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.5174438543801007,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.8312,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5906430371847827,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.9143,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.5551297460220246,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8797,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5511750615423108,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8409,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.536473320667922,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.802,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.49954805051121204,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.8827,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.6099699365320334,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.9626,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.5299390503146365,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.9175,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.5628722071736884,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7777,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4729580792570179,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8673,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5930437779821152,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8206,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4292997621821196,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.8056,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.5944677864022473,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.9485,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5442944872131958,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8904,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.46835436930261953,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8104,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5506574284037457,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8928,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.6866455236359058,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 1.0329,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5153754305561259,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8631,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.5421279505788661,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.9095,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.492670751330977,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.8032,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5320609790810602,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.8853,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.6153617937266217,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.9551,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.5870908953633535,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8036,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.5717943137734841,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.9023,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5250490799889618,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.8634,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4879114155947266,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8236,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.5500009660496658,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.9207,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.5064159311989878,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.8665,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.5204543664192307,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8569,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.47615090357183554,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.8748,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.46986745059184,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.8208,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.5662189526408826,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.9079,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.4614771187559411,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8955,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4128655696266702,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.73,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.46980684931672995,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.8642,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.503751885111121,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.898,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.5246864852992136,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7843,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.6556184227171945,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.9596,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.5452081719478181,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.8479,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4689338175719907,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8096,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5340978809935831,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.8197,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5104927446857357,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.8107,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.5383843138676714,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.8387,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.4971648262048712,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.8195,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.536028429366966,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7823,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.49573802462995736,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8979,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.6504266678543889,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.9873,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.5493979306274729,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.909,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.607176966768405,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.9769,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.6026217081429099,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.9078,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.8260546535143017,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.9109,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.565042480096335,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.9683,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.6631641156663064,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.8976,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.451467688196003,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7868,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.5062873359160209,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.844,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5505012757477366,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.9056,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.48791654606213525,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.772,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4867542459090678,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7957,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4666455643040047,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.8445,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.7241971297993525,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.8769,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5014548367408045,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.6988,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.5485974010726679,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7829,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.6326692826101251,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7927,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.6754768867285195,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.9564,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.5253054477228808,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8997,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.5032148716031363,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7991,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.5873720857985434,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.875,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.536716757908386,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.9323,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.5329601916499632,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.8414,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.6133109203613764,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.9213,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5420958304531465,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7188,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.472702891895565,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.842,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.5724255421667166,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7698,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.5598104974516404,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7986,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.4895677463312306,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.8813,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5162603592848155,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8845,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.5869776595758642,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.861,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5617615598123941,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.9071,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.47855681145062084,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7601,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.604374905007272,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.8423,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.597112711619441,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.902,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.49367528291776197,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7977,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.46024520939954344,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.823,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.5064679136575085,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.8547,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.4681231694973576,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7197,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5194132813288855,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7684,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.44168937865030244,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7906,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5363492869729305,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.8445,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.5630884118990156,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8903,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.5421091365259456,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7832,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5976361727040347,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.905,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.7722641770049049,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.951,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.5342617296373594,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7965,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4493539228425194,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.8467,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.42445309368198597,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7371,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.6110142215922019,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.8311,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4171728629188897,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.8136,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4145065262066619,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7543,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.39505478649502457,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7792,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4750706454904371,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.844,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5645172644167065,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.8761,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.5047096633256831,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.8484,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.5466855524580382,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7414,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4527502164076956,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7615,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.621948024176012,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 1.0494,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.5278015972675635,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7699,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4788403912458289,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.8666,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.6796695538593274,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 1.0207,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.5094336585671372,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.793,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.44382775649854656,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7909,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5464019050437547,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7833,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.440164784544212,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.8175,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.523340673372237,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.8794,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.46938605542367634,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7615,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.6612564034378013,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.8776,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5198656780429122,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.867,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.5849180701499528,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.9267,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.445922051992169,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.809,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.5233120278969384,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.8476,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4467545901087636,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.8565,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.5419164891412087,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.8012,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.49153110184299614,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7781,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.441225854649443,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7527,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5420293577345939,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.8052,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.621160158670621,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.8041,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.6226969416632548,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7079,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.46305532658623033,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.79,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.512448960407912,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.8365,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4574034506128639,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7266,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4887805022058899,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.7922,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.5008127484650859,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.8055,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.6533437588644191,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.9421,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.43865513025574515,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.8206,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.500355747105389,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7647,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.5046542253194481,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7697,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.47989589310254305,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7974,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.45532907895699337,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7756,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.43962213578637105,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.8162,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.5606074664098125,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.8901,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5079830206160058,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.8149,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5085969014108155,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.8234,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.47466515207812976,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7461,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.46289215202445777,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7507,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4921323175256616,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.8246,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5238576407371962,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.8353,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.5521943239755719,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.8444,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.49431425624314157,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.8645,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.7208355365716979,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.9068,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.41838050675402194,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7234,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5633323218709908,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.9846,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5331866506375034,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.8208,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.45965572502528623,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7831,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4637670762530522,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.8461,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.5027914914138983,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.8697,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.4593080370419945,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.748,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.534380283754602,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.8775,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.5399591394604326,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.8975,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.5168919839273327,
+      "learning_rate": 0.0001,
+      "loss": 0.7658,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.698638003373429,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.9027,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4983715615160177,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7335,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4808635157821013,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7875,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6031685623744499,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.936,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.5441544394396668,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7774,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.47565093435532546,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.8093,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4429665536441755,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7513,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.5070355088304355,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.8311,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.4754187913585242,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7966,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.422923847295543,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.736,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.4359887732874065,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7062,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4390223413713934,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.8652,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4118609935094733,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7834,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4211081945079039,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7597,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.45862514356503903,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.8133,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5068072677401498,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.8107,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.5341515575737829,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7058,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4189755931038567,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6702,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.5439267809005683,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7915,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5889477087996099,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.8669,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.429007878832156,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7741,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.5468616463299769,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.8144,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.5383443070500786,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.8272,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.40094297792258693,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.698,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.5756884008992263,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.8481,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.43047632372839495,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6932,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.5593530796265426,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.8549,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4780192226188432,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7478,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.6674983319766304,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7739,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.44192395234058857,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7825,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.41801756105187,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6727,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.43331326662765,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6991,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.5102144606411114,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.8652,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4470810685284007,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7642,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.46452889334302405,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7703,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5029399250037823,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.778,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.5430178947137475,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.899,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.6073099557016157,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.8754,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.5139804486458024,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.8326,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.5099495498357821,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.8307,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.5524566640249836,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.8825,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.48527919854222334,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.8023,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.5126055869339473,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.8201,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.5120298082033016,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7424,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.5961778683910078,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.8604,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.5154840624172337,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.8931,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.4415433685673907,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7059,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.43716725195073497,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7618,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.44773956762975003,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7404,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.5878875504156181,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.9092,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.40954711385606696,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7131,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.43823375388049757,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.8534,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5281994591442473,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.8525,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.6216439994690285,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.912,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.5369931377412475,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.8762,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.42704156401937954,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7276,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.5406466868308399,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7516,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4561752776969113,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7388,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.47229546061123273,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7843,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4325737623515653,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7616,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.49204149089209853,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7128,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.559562407478129,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7505,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.6570003569302663,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.758,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.621282987201334,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.8657,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.4708716086815571,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7436,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.6155562873756527,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.8836,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.5301712023276384,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.8209,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4583907825180325,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7899,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.5170408420260697,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7679,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.43221213770041583,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.8044,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.45574898169048583,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7414,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4493726577026947,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7452,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.49372481319737627,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.8363,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4999818684934643,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.778,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.4717466234568454,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7671,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.6376163660666853,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.8994,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.34455217736810373,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6036,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4902974603118997,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7374,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.49017582100179835,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7625,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.5787294205160753,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7998,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.5165118760391255,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7682,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.49921952578607487,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.8486,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4209301751735459,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7568,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.5408122468777863,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7727,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.4796961423122955,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.8001,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5641615170401117,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.8703,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.5165229219449532,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.8715,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.5041378256889354,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7968,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.7716443536792185,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.993,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4854334707630958,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.8109,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4720714705256159,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.797,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.5897099387394112,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.9193,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4629166003638903,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7839,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.5081906721526724,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7701,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.464097091171769,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7314,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.48248576515473834,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.8174,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.5692971171059747,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7447,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5419683355113112,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.8103,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.45507138238142675,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7874,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.481578233270933,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.8005,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.436227883746338,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6995,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.47298332089542805,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7025,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.6106500045165093,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.8114,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.47740320285622895,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.765,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.702214183673606,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.8412,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5562422215336019,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7733,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.4086239010126385,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7158,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4250546088154585,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6981,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.5252528262159712,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7999,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5250093628028863,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7516,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.44944119057037624,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6996,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.373339469396088,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.702,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.5091404542609552,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.8324,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4559081840121177,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7937,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.5870073860406521,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.8393,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.41024239467456186,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7166,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.47804588242983204,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.8401,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4462204893506717,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7685,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.4025251470225496,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7112,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.6927721113610363,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.789,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4873568417670427,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7658,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.548380044705112,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7898,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.45470480619928993,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7298,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.653472115293909,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.9265,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.5147624023346475,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.8142,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5852532764292365,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6888,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.38289933675282234,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6713,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5745378908728875,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.8259,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.5227230237807072,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.7907,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.48336776108580554,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7115,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.3610806178212394,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.651,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.66297172148611,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.9544,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.44161095809461837,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.7518,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4575633460607062,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7552,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.722723677588711,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.866,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.575276403486101,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.8554,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.45820156110847365,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7398,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.39389106563921494,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7677,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4857233278601645,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7849,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5844203559113743,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.8875,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.47011962025880005,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.8164,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.5636903977516472,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.956,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.5482145020390884,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.8175,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4676641774885784,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.682,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.444258649331897,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7999,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5017393200629316,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7675,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.4951559521909207,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7777,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5125666277336168,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.7507,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.7750273597653224,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.8524,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4861304027210477,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7753,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4889696238512044,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.7927,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.40222117675260893,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7612,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.5650093895753036,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.8037,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.43315491250760535,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6886,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.5462208159205092,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.8101,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.6026763729016966,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.8718,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.4046841435686921,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7625,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4671850132560188,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6781,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.6932221888293731,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.7224,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4637541683704292,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.8209,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.558805849823971,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.741,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4547010934608763,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7943,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5749560405477635,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.9117,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.43805563386820356,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7424,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.534952948286577,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.8127,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.6362228916834066,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.9701,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.45720848528485347,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.737,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5789284241390055,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7442,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.569155336267483,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7617,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.443122709439156,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.7245,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.4574876463725705,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.7237,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.45389616141071737,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7499,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.5023965935902118,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.7202,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4217911819254415,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.7833,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.5839434139720753,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.8005,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5969424275607362,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.8378,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.45194175004812703,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6436,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.47053496931751265,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.7197,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.6773280472521463,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.9276,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5530705397733027,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.9194,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4372528048780638,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7198,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4732575173494202,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.75,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4720051082167063,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7499,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.5569199783528601,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.845,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.49977114616282897,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.8465,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4653277907089936,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.8465,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.7261918299101087,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.8393,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4956959003699411,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7907,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.3960118701005354,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7214,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.5451865636264599,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.7114,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.48667029153582564,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7863,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.41232333715100755,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6882,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5627067828043002,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.8143,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.46843238997424697,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.723,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.4972667220593963,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.7925,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4073474634815315,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6979,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.6293974773116691,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.9463,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.509981295029035,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.8322,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.5875459366532086,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.9178,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4584914305425312,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.696,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.8617998536964497,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.9026,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.49806760390577076,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6906,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.44225597493758884,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.7245,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.4401043803864672,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.7212,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.4323572558722791,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7397,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.43876814969221634,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.7483,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3798739027513692,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6692,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5945235758940287,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 1.0929,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.4825115362383566,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7331,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.6299236378392342,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7812,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.5666011324808448,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.8569,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.5553744921369806,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.8124,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.510333686902676,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.816,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.49559900235242754,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7089,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.46933367880068116,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7866,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3778769997968228,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7081,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.6113594137978235,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.8307,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.9287695702050881,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.7441,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.5382515041385985,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7369,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.4159002164402231,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.7241,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.6291581385668585,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.7429,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4753877904232731,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7765,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.5872610973791114,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7532,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4498892421917224,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.7024,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.8123422803137543,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.779,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.41929481717214745,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.7372,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.5222859963183167,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.8549,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.6670441830142942,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.8975,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.43915978436950875,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.7522,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4456511443247101,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6921,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4615977748085632,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.7786,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.38678529681334994,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7327,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.5884446912909952,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.8667,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.42588880798128614,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.7183,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.4973555744472622,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.759,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.44126780350963773,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.839,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5496050304014765,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.7896,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4314543635541692,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.7548,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4939857171123341,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6889,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.6803546939689971,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.8241,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4302650402092962,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7462,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.43983569426056496,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.7119,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.5166886782701035,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.7505,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.7389775548358781,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.9254,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.6000213315619103,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.8382,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4246772837815841,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.7941,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4927044853629139,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7817,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4437912304260506,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.7978,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.41022730357927784,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7691,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.5113800945371163,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.8154,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.37807366451747665,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6815,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.6703509003684044,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.7944,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.49467576514381967,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7308,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.545773796154925,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7476,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.46733782288285114,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.8258,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.43459441201655236,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7427,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.42114086561672265,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.7156,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4923794610529232,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7756,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4677723291090093,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.7597,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.5635557595289095,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.8143,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.45298397318059835,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7112,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5286896161816468,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.8504,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4430491475058911,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7598,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4769554151753007,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.8451,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.41206085619808797,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7857,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4534131127881824,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.7216,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.5007808999973069,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.7748,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.7024634335260679,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.7737,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.5955987116221747,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.8271,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.46472634520405853,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.7521,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.5521989073266615,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.8539,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.38375503604814787,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6662,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.5227163985214974,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.8451,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4462907521764808,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.7404,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.5570230980140358,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6947,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.595493579967642,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.9008,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.46527357099134,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.803,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5069357035703678,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.8218,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.49345661890932047,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.7617,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5005185587015021,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.868,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.5767711807104086,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.764,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4629486587330603,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.7673,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4937838492946445,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.7787,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.45757088915353605,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7826,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.5804890411250316,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.8319,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.505528222478651,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6136,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.5384893329418087,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.8173,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5356010978715702,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.8947,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.5494221178715695,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.8192,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.39234423655748984,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6581,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.41294804437037486,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6828,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.43510811411794426,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.7168,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4369476079696727,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.7076,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.4669451243115884,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.8218,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.48882962319266837,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.811,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.49329878789885573,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.7949,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.7210438768172956,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.8171,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4174504571366966,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6792,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4766108218070693,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7452,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.569957272402524,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.8384,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4438365781454997,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7255,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.46433683282643373,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.7341,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5478266890770399,
+      "learning_rate": 0.0,
+      "loss": 0.8425,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 503858269749248.0,
+      "train_loss": 0.8357635734558105,
+      "train_runtime": 9226.2551,
+      "train_samples_per_second": 1.084,
+      "train_steps_per_second": 0.068
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 503858269749248.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cfe11e900767f08bfbb608c972a3e25ffe8016c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e1cd9633da34311c727be24d824c74e843e44e1b
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:509f4fe725368c830f31a07e9dfa637e7409475215bca031f12ab92ad28fc1d5
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0a847979c5e08ea1c20375bbacfd924f19b9a600
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cfa623b92512d18b2427e22625319dbb311b37891dcfc42a982b687758f3ac6
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..737baaad191e95a5021aa1d7fee8d0288f129c00
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8230906866528206,
+      "learning_rate": 2e-05,
+      "loss": 1.3718,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8011482349224612,
+      "learning_rate": 4e-05,
+      "loss": 1.3884,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6777465654408708,
+      "learning_rate": 6e-05,
+      "loss": 1.2942,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6985842469244755,
+      "learning_rate": 8e-05,
+      "loss": 1.3011,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.6358810285065617,
+      "learning_rate": 0.0001,
+      "loss": 1.2236,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.8043615850550121,
+      "learning_rate": 0.00012,
+      "loss": 1.1222,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8766371190581345,
+      "learning_rate": 0.00014,
+      "loss": 1.0481,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5896469063055397,
+      "learning_rate": 0.00016,
+      "loss": 1.0115,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5140042859696677,
+      "learning_rate": 0.00018,
+      "loss": 1.0032,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.47047377131530216,
+      "learning_rate": 0.0002,
+      "loss": 0.9718,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.4689357680369753,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 1.0164,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.49055471890156765,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 1.0478,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.48360570699467204,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 1.0179,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4385882347361944,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.8839,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5842432990089418,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 1.0378,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4798987625762095,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9014,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.43385099220728984,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.9908,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4363177117395925,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9302,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4184869797333112,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.9156,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4093043649081553,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9582,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.40265123100906836,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8491,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.40866258779980846,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9515,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.47714968174592676,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.983,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.3725347901708648,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9136,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.48768461879424546,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.9486,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.46702127014841616,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.9692,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.39034215128065525,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8706,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4730253707882492,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.9078,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.3566887338551931,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8921,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.37413026403570376,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8723,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4494091415776092,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.9454,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4380782092404729,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9703,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4024207233737277,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8925,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.40406197533093235,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.9361,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.3618341982376051,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.8614,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.3658614124884964,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8498,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5573838774300858,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8736,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.38557296694847226,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8929,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.40079642763917983,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.8423,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.38961763082023404,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8835,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4099214309122936,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.9405,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.39275472621957447,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8549,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.37890491998796727,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.9153,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.3677543896244243,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8453,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3798879355149729,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.8136,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.39541256509109524,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8917,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3933918528877009,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.8155,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.40698599023324944,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8666,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.40042005946240466,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8412,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4607168760067279,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8868,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.39972125720735685,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8918,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4188280311405362,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.902,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.42940697765165986,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8641,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3545329739920464,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8735,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.39694907744821073,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7712,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3873462097880066,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8613,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.39944251069469355,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.8342,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.38873644333253415,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8602,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.41970967893409644,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.8577,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3660391396843375,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8452,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4567680983316884,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.9641,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4347858356915908,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.9423,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3337070071383876,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8076,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4286830947909843,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.9527,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.35295999710631726,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8146,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.40069838158976223,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.9182,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.428778831044889,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.9332,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.35155177053946185,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8472,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.3393523853315711,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8116,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.39467587805954535,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.8882,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.45738467437649594,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.9899,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.40860575562909096,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8518,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.37358375585659853,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8671,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3614438681293151,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7581,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.39185167616797406,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8273,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3725033489438709,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.7593,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3879109255323267,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.8456,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3449533879291914,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.8246,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.3557304717641571,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.8002,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.39624711741320795,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.845,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4040277144139735,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.9255,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3573834496881195,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8664,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.39861349237455607,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.869,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4122338593577007,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8542,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.38711562232847824,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8265,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.39947214892388133,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.9227,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.36818193645695974,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.8025,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3563867965150087,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.799,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3863926930438656,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.9039,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.35592980299837496,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8311,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4204313739806602,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.9226,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3723535694546385,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8311,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.41234782362203953,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.8979,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4053425895286585,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8344,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.36505412209597476,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.8345,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.382611792068928,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8749,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.37741695374713863,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.8493,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4676602590760305,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8432,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.34327944895308454,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7974,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3556557830313786,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8677,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4186166790777886,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.8586,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.37112136732584244,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8209,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.38959686649650355,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.8077,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.36221559570247797,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8142,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3408395021986205,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.825,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.46256890237184806,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.9299,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4279687120096611,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.9313,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.499080817408835,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.9257,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3677888976110564,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8234,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.40300933145036566,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8661,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.35272019193019816,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7685,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.42072850665932426,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.838,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3843947444096128,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7295,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.43191105712976785,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8535,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3584146254614676,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8303,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4020587727121328,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.8827,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.42244806005338637,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.8637,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3490560388254176,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7685,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3610932978019899,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.7689,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.35696651622274633,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8692,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.37982273980822234,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.8633,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.49338348781038344,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7882,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.41407314429068576,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.8352,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.35863668034543733,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.824,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.34638170296412163,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7332,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.33192770455483456,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.8012,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4018840350246986,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8274,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.46935625670519754,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.9137,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3467457829185042,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.8057,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3713608562339026,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.778,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.2953063312947437,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7741,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.315778283756295,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7968,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3777293731399955,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.8476,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.36565481085206214,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7424,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.40097648303660766,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.8961,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4335469508570514,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.9337,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3571253000150674,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7772,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4175995621863514,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7899,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3555584844534414,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.8058,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4478506698284743,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8602,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3924154628163152,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.8566,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3413680221828951,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.8442,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.37597687671024504,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7896,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.35508126757199976,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7702,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.35676098581829857,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7463,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.34262057715324995,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8023,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3160775424152765,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.753,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.41408430747051145,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8722,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.3807991104363706,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7902,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3444267731692557,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7729,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3198788705719864,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.7863,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.41645947104870057,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8442,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3473642274949473,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7755,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.351233087536269,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7799,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.39062438800982935,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.8262,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4661216475962772,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.8754,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3680543907349618,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.8511,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3554554750391017,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7905,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.34003331830378203,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.8518,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.35691497758794416,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.8042,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3680532317532297,
+      "learning_rate": 0.0001,
+      "loss": 0.826,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4306874440922751,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.8114,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3927827556296759,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.8477,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3688233147013582,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.787,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3460381638988584,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7861,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3293873896538617,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7599,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.33067678398241424,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7788,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.30272346196435956,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7594,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.35694173144440655,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.8066,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.36394962483417836,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.6855,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4052142574936814,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.8238,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4466361193230218,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7949,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.3691146825930911,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7591,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3884324905017403,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7671,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.37235345511558177,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7954,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.43342144747289674,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7674,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3601056959148259,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.6814,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.34765672643846884,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.8115,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3542918688839561,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7751,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.43405703341062163,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.883,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.35990538609349737,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.8274,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.36566849513888866,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8316,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3880712586476513,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7823,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.40070345592700185,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.8682,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.452126500075801,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7349,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.39133686156881714,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8166,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.30433046880932474,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7799,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.38912324970709505,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.874,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.34571755403615634,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7972,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3759979893898139,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7436,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3396623833976361,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.771,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3930329153264238,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7294,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.384939784498545,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.8072,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3670712491930503,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.8139,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.37471193485584176,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.8026,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3466481941938704,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7856,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.334681278560714,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7457,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3580934295092547,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.8062,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.41942104569534644,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.8307,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3044902604615677,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.6707,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.37144444775815594,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7813,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.34248491706444256,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.8051,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.34748171044370163,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7627,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.38512453359767645,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.8318,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.36707414698679175,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.8305,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.48217984342912773,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.9014,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.38863801931906744,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.8525,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.35518873432323794,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7778,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.34123260040379993,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.775,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4001757666899788,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7731,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3315496723721597,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7922,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3258037685568246,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.699,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3884518590202934,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7866,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.7104627436824442,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8089,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3013941102836104,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7054,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3731169801442719,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7737,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.30263267234838137,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7029,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3659751206161679,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.8146,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.36421909907458466,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7769,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3403495561782968,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.8074,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.4116298314841219,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7561,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4017877316418641,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7802,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3933048903616485,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.8302,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4053820034136759,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7537,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3422570001344906,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7513,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3672321719467038,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7552,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.38655052693729053,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.8076,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.32046006517959386,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7549,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.45524335147940526,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.8619,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.29405543478538154,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7585,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.38285886702055727,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.8357,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3801889952884557,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.8901,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.35279011141194516,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.7481,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3471907142593491,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7855,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.36619187308979667,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7625,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3762428412156853,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.8157,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3171161168873978,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7799,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3592730455514312,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7478,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4134931519699558,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.8437,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.30546020641842614,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7224,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3454408457555957,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7727,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3680677506255253,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7745,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.36274461235683025,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.8305,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4459754472676995,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8917,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.34453176080889125,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7476,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.31196539201987367,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7468,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3184608885787424,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7402,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.36443373379957883,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7567,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4417924711660864,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.823,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3236759284366048,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.6885,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.39242767874833534,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.9275,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3427482665469463,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7372,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3626863241157415,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.8017,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.34605782828987924,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.8484,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4383327443511539,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.8186,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3447222581311919,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7213,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.31970536995245263,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.7434,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3656450810361097,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7676,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.36298269369872477,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7487,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.42152653173895593,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.8926,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.38946242385235036,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.8176,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.41804903127540954,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7996,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.31093486947817817,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7247,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3187140163567531,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7473,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3872254324451249,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.8821,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4051984951107359,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7603,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.40122800893302063,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.8414,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.35538652933955217,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7676,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3137491139738254,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7501,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.45963025353581044,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7936,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3348526950988372,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.736,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.39896905950270944,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7654,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.37722430967890513,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7327,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.31478006761399024,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7615,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4188263703262344,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.8783,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.31942616334359136,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.725,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.2997679274265762,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.7615,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.36551094625988517,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7969,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.34379292762124014,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.8058,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.35790166821440883,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7782,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.33879760718838586,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7636,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.31407783216446544,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7323,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.38757334898188933,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.8443,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.39258218690616686,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.8223,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3313155165113673,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7951,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.32605323503196637,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.799,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.393570523991794,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.741,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.37378520088314604,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7429,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3179898573134195,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.7852,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.33638562625723384,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7518,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.36906155605346674,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7948,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5865294985065448,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7865,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.3357869205531904,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.8067,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3214550435563662,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7578,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3617697253780003,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7783,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3851342510588961,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7932,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.34426702777854773,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.7609,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.36253585088408663,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7984,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4054570274935403,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.8,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.34938837916613863,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.8201,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.37777115892854124,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.8213,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.37766797195751295,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7736,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.34012917606966536,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7852,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.39195816403535333,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7284,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.38813789832520107,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.8645,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.32915922567617034,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7437,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3351156902277233,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7069,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.32461016981926744,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7698,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.34243368327755425,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.8092,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.35449923757603197,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7537,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.37595905935200546,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.7974,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3296041419119035,
+      "learning_rate": 0.0,
+      "loss": 0.7362,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 730661894488064.0,
+      "train_loss": 0.8330854674180349,
+      "train_runtime": 9213.2243,
+      "train_samples_per_second": 1.085,
+      "train_steps_per_second": 0.034
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 730661894488064.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..08e200f2a07bbf95ef9c06ac9018bcef73182e0b
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "o_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..20cd56d587ecd1f3fca7e27f515774a06db21e79
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb223c3820e4d3ba11a4de32d2e8233430f8ee290e3539d12aefaa3e3db8d9e5
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b37acf92c0d29b3c161d187f94539b24e07eb742
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a0dab0cefd97719e632dbe34186b207b24721cd439d644851337f06d4b9e2aa
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b621d23102cafd7b8ffd5a04cba11d96194ecd15
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.0793138209817712,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4104,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8674660345497316,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2554,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8985758119498529,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3902,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7246031443112888,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.3101,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.6511872684793721,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.1316,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.763616935771609,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.3126,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.6908154183889081,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.0942,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.2335001982189193,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.3125,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.9498545484386265,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.0454,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.0250421172188384,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.1987,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 1.0356218006105504,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.1966,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.7691232611946629,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 1.1452,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.6137120435062169,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9656,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.6555096542988432,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 1.0269,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6836787264972417,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 1.107,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5915370874369371,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.979,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.6193214806287105,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.9109,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6553819808381135,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.977,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.6607466103071827,
+      "learning_rate": 0.0002,
+      "loss": 0.9003,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.577074883316804,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8983,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.6291563797217955,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9503,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.6851019336127845,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 1.0646,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.6473518797136671,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 1.0297,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5756352725362215,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8767,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6520852714809134,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9888,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5591575247915785,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9146,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5223616842749025,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8748,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5701781592261361,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.9207,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.6596981955668625,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9812,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6389275851096986,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 1.0121,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5692044640211616,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 1.0032,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.6172755546909754,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.9598,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.5780978905199465,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8745,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.49979542206264266,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8903,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.6544043706335699,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 1.0492,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5269776535593357,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.9326,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.5703236802291971,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8848,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5748680136894786,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 1.0248,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.6715969458430749,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 1.0063,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.532057621226445,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8946,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.4833052914364651,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8794,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.5473279088698683,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.9075,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4883523970636091,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8992,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5340051810138325,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.9149,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5064485228548891,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8637,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.7975601380952108,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 1.0321,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.44587397079465824,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8226,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.48608845583225374,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.8716,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.5198898257438284,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8802,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.8158293298366603,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8988,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.5781269309537154,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.9523,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5945814159638966,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 1.0096,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5087211255257024,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8375,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5910704844668925,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7949,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.6165737968134988,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.9585,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5272556688243508,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8688,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.48991461637614614,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.9111,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.605546363473904,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.99,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.46970358749198515,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.889,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4323321438964964,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8242,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.6421355965110687,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 1.0748,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4745832195507423,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8315,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.6807110202412574,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.9852,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5516621767923496,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.9619,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5442774338710398,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8259,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.6137001844755123,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8962,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5438522046733677,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.882,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.6269568278040801,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 1.0232,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5683434963903127,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8979,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5196024255658687,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.9263,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4243653568771807,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8549,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.6071660525964858,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.9677,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.5017063724170426,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7994,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5670301642307584,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.974,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5174898485382943,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.9166,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5727910841053342,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.9744,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.5892412963128097,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.9508,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.642838513538222,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.9288,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5078172724450312,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.8302,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6104299828421814,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.9432,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.5567938689379198,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.9509,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5174359389523094,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8202,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.6102149265809388,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.931,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.488489313365436,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.9127,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.5191733814891298,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.819,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.49796814189560956,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8484,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5380837654524147,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.908,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5019138128050206,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8827,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.46651823153453,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7727,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4986566878404386,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.833,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4983672326086276,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7986,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.6330160448213425,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.9544,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.44904646849391194,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7966,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4542692665928206,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8242,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.515424314483351,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.9082,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5697447448543441,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.9013,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.5137764674379561,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8137,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.479919016645595,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8398,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.49647779658563373,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.9328,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.6899167671368104,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8429,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.61597299646533,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.866,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5050759894276498,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.9003,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.5553537290912124,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.9935,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5709926920654692,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.933,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5749511557771967,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8781,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.574231287799876,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8559,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4575436155659373,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8057,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.512997079291813,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.9096,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4543950317580882,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8099,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.7415531993835159,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.92,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.5031015613723627,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8489,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5069331186376628,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8417,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5296095779215624,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8415,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.6174068211456757,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 1.014,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5038692627979767,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.9044,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5174851061554335,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.858,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.5117319530131302,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.9151,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.55146182969168,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.9628,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.47125698940698507,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7792,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.48183861719102145,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.81,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.7076884238374276,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.9862,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5277239123951805,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8455,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5500019878771762,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.9344,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.5681596854522988,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8738,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.537228959356412,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.9068,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4068580570219105,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7808,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5709251278452853,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.9829,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5220868006365708,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.8227,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4291156529108893,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7303,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5244275087026982,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8909,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5749167648537551,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.9396,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5006517894046707,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8603,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.42912502006697223,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.827,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.6563250170391516,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 1.0645,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5071156739289818,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.843,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4443624151432539,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7973,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.5378395523138448,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8218,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4372134744896649,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8177,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.5704501228806174,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8947,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5534021620896453,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.9012,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.5088507443455628,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8315,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.6305342270801653,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.9733,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.5556030663549947,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8774,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.584957212445808,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8956,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.49116410611405603,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7761,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.6558956525178432,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 1.0159,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.46229926694528667,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7958,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.533918210765701,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.8272,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.5525524093954354,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.8472,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5268413236464946,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.789,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.6034098838618137,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.768,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5123820139406852,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8368,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.5221297704321616,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.876,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5443635828037163,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.904,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4127347153247367,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7049,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5058913241758101,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8628,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.5241538262739248,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8588,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4339969221990366,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.8594,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.5410233290873173,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.8184,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5643039351700077,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.8973,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5070570954794127,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.9373,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.6431993631294437,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8834,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5463897311592459,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8516,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5510992888253026,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.9278,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.48576945131497623,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.8189,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5455089581962679,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.849,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4671115204530441,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8106,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.606453541968653,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8329,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.5246409469178386,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8416,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5325599880382337,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.8032,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.586907938828374,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.8808,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.6172095666111331,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8312,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.6019656760906752,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8424,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.49991172993633143,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8421,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.7448569391878513,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8937,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4519078160719926,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.8147,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.5294837761405404,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.8715,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4543029332406574,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8011,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.5030499995735945,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7599,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5249044057354469,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7863,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.5367192267025025,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7586,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.6046994598165022,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8595,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.5511975892576534,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.8884,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.47021844068648017,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7914,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5609037523683333,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.8113,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.5278715536138033,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.8919,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.47435584083177773,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7481,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.668966210084083,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.9043,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4942687214975865,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7966,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4764422141715473,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7993,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.6016523774814396,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.9557,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4661173796072379,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.8348,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.6051015964428406,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.9355,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4578947855365344,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.8189,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.4509433112679857,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.8225,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.5235831584168316,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.8879,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.44986301429924835,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8279,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5490905986071285,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.8937,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.5282971310739915,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.8387,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5582842618729132,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.8755,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4974797052185431,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.702,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.6767520835439186,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.9832,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.49096721822733147,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.834,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4336794454311267,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7807,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.561256529665769,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.8304,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5346326077710617,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.8853,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.5575705612708424,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.845,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.6113035100414476,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.8369,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5531036051454451,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.8495,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5191241536194888,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8543,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.6609229189584678,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.9303,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.523613656193414,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.8125,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.6000257986217872,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.9602,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.5284405089059923,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.8679,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.784321921220287,
+      "learning_rate": 0.000152669141192587,
+      "loss": 1.0552,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5348576999816136,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8425,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.7391155392801806,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.9355,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.46575258380512435,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.76,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.48666315272804594,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8717,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.565648946468228,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.8518,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.5309309049154313,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8486,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4684232580120976,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.8592,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4764313022388415,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.8644,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.8136632652056193,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.9112,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.6438521631504122,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8526,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.5102341931924466,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7808,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.5940550280405117,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.8275,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.623552086471691,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.9372,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.5378567905788513,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8185,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4925727020488738,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.8259,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.6315840232859294,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.8099,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5932994881382887,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.876,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.6004134023890813,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.869,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5748874245676628,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8937,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4777508163658265,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.84,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5437019804171094,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.8783,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.6389138378535143,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.8807,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4882309118075003,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.8168,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.5168071744262905,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.8979,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.49827826762164956,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8399,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.5702744992964714,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.8623,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5576641561434929,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.8827,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.46111048315752434,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7337,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.6104351142056681,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7917,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.6977294844697014,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.9764,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.5464742075267145,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.8489,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.4408710931040891,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7347,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4550331663208126,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.8172,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.47431682121182805,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.755,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6035254807715448,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.9423,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.3751226324854456,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6642,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.45736782627361144,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7436,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.5145611235864497,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8351,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.9386687957598261,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.8669,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5862045163334885,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.9092,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.7037646177150105,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.8805,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.8613306434547313,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.9224,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.43063135054561263,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7473,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.4726821562111012,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7395,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5654880939742353,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7463,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.532729889223182,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.8258,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4296463182758616,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.773,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.3947029204695878,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7224,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.46957175369507925,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.8126,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5067396710596804,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.8827,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4153569581713736,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7829,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.5163079913910601,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7452,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.47426234833369857,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7982,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.5988256168061469,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.9765,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4728667413328611,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7693,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4831701154962461,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.8936,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.6182426306462532,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.9779,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.5004947438392414,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.8361,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4575584899019911,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7898,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5593064170621754,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.78,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.48521475508879675,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.8706,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.5383484682602684,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7173,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.46203346435210657,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7481,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.6076992808818641,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.8122,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5864862009753786,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.9199,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.5008180806943826,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.739,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.46245114842878776,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7855,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.5635026031048164,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.8507,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.633749384863932,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7524,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.48959796070667255,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.8805,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.599644675537256,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.9511,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.39557107991372126,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6824,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5114523104146805,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.8748,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4603270033213667,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7276,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4850670420577096,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7544,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4615560420158054,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.8094,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.5199828460695506,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.8646,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4706257553077559,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.8289,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.41943877343889957,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.7521,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4716217258820339,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7627,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.6786834378679842,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.8907,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.43217816337126086,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7732,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.46949227376853764,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.9017,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4661775390489814,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.714,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.46140392359972254,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7431,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.4550459454163621,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.8215,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4415086849559902,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.8797,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.598195900084871,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.8701,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4465807702003201,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7488,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.6155362524306691,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.8282,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4180641928461069,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6957,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.5513503369328582,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7914,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4842650496159258,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7634,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5263091081666915,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.8129,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.5938506329556743,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.9201,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.551330984952594,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.8799,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.7601199557677949,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.9105,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4303591705924048,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7641,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5397792424413276,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.9404,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.4887982118564506,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.8053,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4673734981798477,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7912,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4851329513640223,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7895,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4348341884313913,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7757,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.5619573417540332,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.8219,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4448837026067695,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.8214,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4412528222008381,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.8126,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.49795418352279724,
+      "learning_rate": 0.0001,
+      "loss": 0.8511,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.6646338270787656,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.9252,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.6047251048973974,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.8856,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4940920844742925,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.8153,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6814053429906266,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.8841,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.433215558732901,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6603,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.44957536906911494,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.716,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4914217112657972,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.8497,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4750281054941176,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7759,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.4605094039390799,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7512,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5301471784253153,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7561,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.5156702413525251,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.8423,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.44662040635807865,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7822,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4208428464465991,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7097,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4355712137687321,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7986,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.4928953933111074,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7569,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5224733963670573,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.8273,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.5771379087578413,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.8248,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.48450806448263845,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.8014,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.481103005933363,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.8022,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5538219740275729,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7581,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.4418676812397226,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7544,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.5050698181061205,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7913,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4767268572887257,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.8206,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4146839368127928,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.8135,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.49714856909001787,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7661,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4645738096617371,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7932,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.5418003671606643,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.8437,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.49298359240777184,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7813,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.7609953729698271,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.8185,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.44183019732999196,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7714,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.4088592388689686,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.7494,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.40975797453002993,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7723,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.500990734539649,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7737,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.8763033227557493,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.8078,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.4098708175269476,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7038,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.6136809658421051,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.8555,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.6039742316669718,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.8918,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5946975858000318,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7445,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.5243833434113363,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.8365,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.696009625451111,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.8724,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.6203198423638049,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.8991,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.431515175621456,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6857,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.519278586203387,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.784,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.49743930237280587,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.8482,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.5849132524626789,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.8286,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.46552588262598193,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.856,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.4482646417650763,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7512,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.42486744603141685,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6975,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.43976643777741153,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7258,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.6272719537569044,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.899,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.3986327415141386,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7104,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4955115549656417,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.8892,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.49512595703273493,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7756,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5905507667064706,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.8908,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.44001761796947125,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7847,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.484668167723542,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.8202,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.5061493652802893,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7659,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.459908723396548,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7429,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.5311269649573576,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.8575,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4272449890215387,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7761,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.47308670849243734,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.8051,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.47459530007763184,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6891,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.42918346769674953,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7558,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.4843615427233137,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7177,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.3931012009976337,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6856,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.6674786874769335,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.9333,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.403434388808353,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6809,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.42865534645955083,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7807,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.5531121427660198,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.8009,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.40286712803632035,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.836,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4687974343547248,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.8253,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.47197844007080497,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7944,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.486408237962315,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7509,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.5869738610437195,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7953,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.43457878847863257,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6897,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.5764860701204353,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.9262,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4151703618751473,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7574,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.46799090298160206,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7309,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.4767199781872603,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7432,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.5338210338772973,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.8245,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.44315164336724483,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7933,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.5232242045280784,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.8696,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.3761336773443223,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7843,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.49015333408077183,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.8156,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.4949869156738355,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.8448,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5513926873484194,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.944,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.5076373807708922,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.8312,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.5221652087488239,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.8048,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.7909223310975126,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.9155,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5050567694945293,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7902,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4933778254758736,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.8708,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.6085020099507188,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.9631,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.48881092432081136,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7942,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.5328506602391134,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.8172,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.49480227356004286,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7833,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.48542832605280173,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.8107,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.47554761567105164,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.712,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5865668935576726,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.9088,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.41550042607241994,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7112,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.40232320456189885,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7461,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.43558339499848475,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6831,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.44795635239543685,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7535,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5688062900456592,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.7766,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5411100645268582,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.8379,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.785173955175749,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.8736,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5694372419439617,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.8607,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.44414458225125525,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7479,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.38581058519584893,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6847,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.5764557383958909,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.8552,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.44431394707674726,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7253,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.46415861733429314,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.8071,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.4040067644632014,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.651,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.47156609100041014,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7547,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4751331987703744,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7702,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.5570235075279028,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.8715,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4242586984566103,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7714,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.4285280557058061,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.7948,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4644510958600794,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7333,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.4521746592306217,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7224,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.5913011471811631,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7531,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.453332627125057,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7774,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.5313908009606292,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7977,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.44955174895943734,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7746,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.6126618976150218,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.8736,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.5822202712721202,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.9148,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4986228890404806,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7716,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.4007906896378964,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6834,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5493506440801259,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.8659,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.49276563271224727,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.7534,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.48320789627580496,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.8172,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.41936074078974206,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7933,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.5372564975322116,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.8315,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.5801661809847876,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.7001,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4837845296948569,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.8001,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.7266773227848353,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.94,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.550336617198042,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.8747,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.5502687804150279,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.8737,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3770404511005428,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.658,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.5400732162785093,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.8436,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5625985678957215,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.8116,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.4990853595000655,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.8323,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.5413158096901148,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.8623,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.44364974048301387,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7717,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4818165245901257,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.8421,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.48078247572273736,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7534,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.7066838487989124,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.8655,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.5251245254632664,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.8004,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5895314067131298,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.7863,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.42127664447603985,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6784,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4691636152593432,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.8385,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4501354628353762,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.7619,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.415124794643123,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7181,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.5627594769488438,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.9042,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.44490947481959,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6357,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.5817279105746219,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.8829,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.6587378305549441,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.8005,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.42424532031828477,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7305,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.40489764871202116,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.7282,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.5056254245102135,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6816,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.44314862665401866,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7087,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.4988712835436097,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.7699,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.5194393424400168,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7883,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5655293463573838,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.8512,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4573170674666667,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7258,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.4717436563473585,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.7664,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.7600668417367207,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.9817,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.454285204781225,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6792,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5134769941764181,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7161,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.41445927327481324,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7334,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4469742005032891,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.7311,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.4412576821600429,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.7303,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.46287664763589803,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.816,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.45669162759999254,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.7279,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.36611990224798674,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.7352,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.5656992758486998,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.7765,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.6842431356491339,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.8718,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.4800259822183707,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.8331,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4744162443639759,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.7533,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.529271868197914,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.9258,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5646985855066731,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.8823,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4240601302452028,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7133,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4573956395152056,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.7949,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4769725950279453,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7622,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.5510634691154779,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.8081,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.5682105476060857,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.917,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.5277516399630586,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.8857,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.7500931605600588,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.9307,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5040665929325822,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.841,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.39455094792522016,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6728,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.5572000427210648,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.7578,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.5187875614390844,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.774,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4405333764702291,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.7851,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.6529238235867248,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.7798,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.4882094578728378,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7813,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.47297380654401905,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.783,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.41791290137820514,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7383,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.614821635899501,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.8027,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5629745897752465,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.773,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.6034340485024299,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.9058,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.45977861182601576,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.8149,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.50221813379731,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.736,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.5073291923422781,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7513,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5072707035921479,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.7421,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.46582944938080906,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.717,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.3939880605472184,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7486,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.41842556533290937,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.7163,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.45976787759587967,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.666,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5741122959132184,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.7962,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.5574064273608463,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.785,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.5863087560710555,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.819,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.5600281242293509,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.8736,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.461928830964332,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6682,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4762813675960855,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.823,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.580988891335986,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7691,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.4721345192605899,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7923,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4660525584882778,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7715,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.532626625364438,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6877,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5857105608963418,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.8983,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.46271941466564903,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.714,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.4497277199017365,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.7407,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.5850063104492209,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.7714,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.5618659006849629,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7518,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.511506487763565,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7431,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.450060820779741,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6743,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.40282916698031773,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6709,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.41181490509250535,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6955,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.5045281883413828,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.7732,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5191836427394763,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.8452,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.428564300937728,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.717,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.5153289123449012,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6808,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4616447424916075,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.7531,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.39681321635925004,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7446,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.7129929489595015,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.8922,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.44793949504325403,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.7386,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.5872252829296346,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.8107,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.41757767107027727,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.7552,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5278586013338012,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.8128,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4764369269232915,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.8243,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.445195581809496,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6867,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.49976681029923836,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.7059,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4124258078264793,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7264,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4499532492200171,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.79,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.47209635231334507,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6334,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4856682554711025,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.7553,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.5662153645220434,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.8279,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4090240938346095,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.7344,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.537789948994782,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.828,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4332640950791847,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.8001,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.43843217972504145,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7406,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.445514088395683,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.7595,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4275969003141511,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.7319,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.6181916419087072,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.8866,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4964113303224607,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7641,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5931750787393189,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.75,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.4920843039984812,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.7729,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.37980050845658425,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.6829,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3963894276679346,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6427,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4154526430122979,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6851,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.5199993448362794,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.7961,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.5750088863420132,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.8081,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.5140225691311261,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.8013,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.49898354854720056,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7381,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.5249545174252546,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.729,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4401588184940419,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7629,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4189629890238096,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7666,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.5006645773204051,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6968,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.5632707912819288,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.8454,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.6024324926308866,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.8279,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.582047873817297,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.7981,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4564155317343173,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.7074,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.6058058693949488,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.8158,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.4181397209604915,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7312,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4614300144965895,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.7338,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4846363832282807,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.8036,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4609145529752502,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6841,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.5076128767123109,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.8411,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.485767970970762,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.761,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5029976589432248,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7573,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.46078550545735925,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.8573,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5125683254773828,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.8219,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.7099555205627927,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.8329,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.5257512987120833,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.7691,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.44955284490495473,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6724,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.43068643981099336,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7384,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.7404433663435687,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.8677,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.5401203008083829,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.727,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.5506770140027683,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.8123,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5093788680691201,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.83,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.6168736957077788,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.9002,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4196641464596395,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.77,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4784595580207899,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.7212,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.48297015578133545,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.754,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.3956503814631328,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.7458,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.5010590810156292,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.8102,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.4906446661999585,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7993,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.517825027389168,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.8563,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.4786367198289518,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.7457,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.48317436212573805,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.7838,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.41016938444393536,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6589,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.5792843604463185,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.8008,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4562185264492372,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7481,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.5315806159191756,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.8277,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4996324184516367,
+      "learning_rate": 0.0,
+      "loss": 0.7711,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 507159541022720.0,
+      "train_loss": 0.8328443063735962,
+      "train_runtime": 9270.2391,
+      "train_samples_per_second": 1.079,
+      "train_steps_per_second": 0.067
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 507159541022720.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..04695a99b4d17a66e0122de85a215f376b3dc301
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "down_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7ef969b1572f1f08788458dca64baed71cf5ca1b
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b97c5d7de491d336dcc2b1c913eec57bc2ade51a9e0214178292b044b106f4bf
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..71c5456d9b80fcede1abb6e90ce59e959383fae8
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:510b7354cbdb44580b8fcff4c0cba0358329aad7b5f7e48dcb45928b558fe28f
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..03527f65109970f26c67a8a5de13dcee2ea45072
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8210982159609682,
+      "learning_rate": 2e-05,
+      "loss": 1.3329,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7751787218518346,
+      "learning_rate": 4e-05,
+      "loss": 1.3721,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6646539221769208,
+      "learning_rate": 6e-05,
+      "loss": 1.3088,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7036195246304789,
+      "learning_rate": 8e-05,
+      "loss": 1.3377,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.6168233543536914,
+      "learning_rate": 0.0001,
+      "loss": 1.1919,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.8492868526153039,
+      "learning_rate": 0.00012,
+      "loss": 1.2512,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7936611575617178,
+      "learning_rate": 0.00014,
+      "loss": 1.0697,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5862560941923312,
+      "learning_rate": 0.00016,
+      "loss": 1.0911,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.49328071163938675,
+      "learning_rate": 0.00018,
+      "loss": 0.9762,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4796953108328627,
+      "learning_rate": 0.0002,
+      "loss": 0.9269,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.47431010608538954,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 1.032,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4707062597281157,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9748,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4880961566615289,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9767,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4658440438656818,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9179,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5344941086238363,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 1.0129,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5007192463357719,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9953,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.44861347739015633,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8987,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.40432515099975497,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9949,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.42728242685352247,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.9649,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4641229091681979,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9609,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.3913009193318844,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8997,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.3960221279711468,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9046,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4893549306818581,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.9538,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.3943436800759236,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.85,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.49281687959276665,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8904,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.45164645594942804,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.9793,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.3884822532677797,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8047,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.41101776336511836,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.9077,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 1.3086028478450786,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.9363,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3375555003941169,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8523,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4356929720245981,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.9504,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4558256010915256,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9619,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4182066475020067,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.853,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.42581671871982196,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.9528,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.39591765873363666,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.9005,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.39119354380322896,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8988,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.40090914965423335,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8758,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.3945361491838484,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.9393,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4394341724704068,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.9428,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3727456121947463,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8748,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.37234199586270694,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.8736,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.39842513634466276,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.9022,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.38128602501461806,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.8208,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.38025302062603994,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8832,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.34279246176384087,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.79,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.39891015144038994,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8637,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.33599563185761405,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.7904,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.405639307195739,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8904,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.347008173492179,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8188,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4053373760955314,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8767,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4071076068532092,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8753,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3947573279192353,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.9461,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.41593394777293125,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8543,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.34556265770047007,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8457,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4135992491712443,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.8471,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.36656186910259925,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8315,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.41888747530592474,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.9118,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.36251863953206737,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8626,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3748079918860433,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.9207,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.34699582513061866,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7823,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4324436996517203,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.9012,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.40830183945729825,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8866,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.35645475792133624,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8246,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.39501413519318324,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8903,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.3403250694314037,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8002,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.38578492015884985,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8823,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.42912996214481597,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.9337,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.33840309416398107,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8152,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.36261675945692257,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8119,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3978630764800173,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.8884,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4067690308983075,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.888,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.39761946908218565,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8734,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.416514669804446,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8813,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3831820092966067,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8015,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.40531349166429315,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8021,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3757725806826467,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.7819,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3973182683957758,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.8807,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3251584668348043,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7627,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.41127123846488717,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.8387,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.37661663005517965,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.8423,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5786386678521719,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.8949,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.362252109033277,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8644,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3786074251460458,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.8161,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.38704773419959004,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8054,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.36687924172159914,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8053,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4532905914552837,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8496,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.3868107415971005,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.8292,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.45004745878752395,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8386,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3601950439013075,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.8144,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.39414164051104805,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.7563,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4162252396854745,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.79,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.37960017989400824,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8201,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3838649671026415,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.8362,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.43789279418131016,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8057,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.33824523290379066,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7832,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.38508464229871187,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8825,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3872280278580832,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.8568,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3710845404668928,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8433,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3601850097818311,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.8426,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.36336268281281814,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8459,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4007311050026622,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.8332,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3279742689663737,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7898,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3937884919475543,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.8439,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.4143083432212905,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8246,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.35676836384855826,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.8377,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4418839450505997,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.8546,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.38722861928111846,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.8938,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4798810748731012,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.933,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4192610184794806,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8314,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.37992908787234797,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8502,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3537079585046425,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.837,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4947879619869325,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.8655,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4144273342987794,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.8024,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.42810201253462243,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8646,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.43495581513946346,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8019,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.40514388555481956,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.8307,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5344793506502119,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.8583,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3591362856868415,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8422,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.43800340514585073,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.829,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3546231000833001,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8502,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3824038358260862,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.8601,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3773025369439645,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7483,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.40841295198664107,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.8981,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.32297152126914114,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7604,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.38908595736780166,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.8377,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.30749072492921053,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.6953,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.42279066677539,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8381,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.46835688525644387,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.8801,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.38170959986063163,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.8177,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3748992249270009,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7318,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.32117252663093104,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7911,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.32643252918835663,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7565,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.34707082290937796,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.8179,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.37747643787259155,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7647,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4042549495002011,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.8638,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.40682592869078416,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.9235,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3394181019470551,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.8029,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.37379683333378905,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.8183,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.37760782532653564,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.732,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.42926915905667945,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8645,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3353578654728262,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7573,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.36702732140537525,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7855,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3945829288474564,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.9015,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3384991496539021,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7672,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.36235446616831335,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7298,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.37275162386027966,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8231,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3351601172488379,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.78,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4177215304468739,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8148,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.31882281813401364,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.8256,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.34927524739208016,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7206,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3272143865687314,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.8422,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.38082561425167516,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.801,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3683618334975772,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.754,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3460529686559446,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7703,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3906059155144969,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.8635,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4498300529094466,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.887,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.34890284462378934,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.8415,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.35283761184137985,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7897,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3383893586354043,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.7699,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.37944587951981884,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.8168,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.35361574612426283,
+      "learning_rate": 0.0001,
+      "loss": 0.8237,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.49601519414630724,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.8944,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.4329854234900553,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.8409,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.33753405466010045,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.6823,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3641968735452542,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7999,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3412252144703986,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7485,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.36557961816073514,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.8047,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3143242351094908,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7417,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.37740381793108085,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7847,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3756001432572221,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.8045,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.3886531783560117,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7739,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.35961818663851197,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7682,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.34291998146787495,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.8154,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.365623213677616,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7723,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4154693198085254,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.8059,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.41853675355977066,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7936,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3029988536971098,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7538,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.35015284331125907,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.784,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.36899119805196373,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7712,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4635909482238713,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8151,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.39324815381124656,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.8479,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3790446343866871,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7849,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.38585647860183997,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.8062,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3851371620272845,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.8341,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.33007054542330805,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7197,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.42174584227102946,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8017,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.33067818645649066,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7886,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.42265554001971417,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.8238,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.34908037692220767,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7928,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3700495310823468,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7491,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3535456489525407,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.8094,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.35992816828147584,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.747,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.34357552011988995,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7329,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.4017091190478587,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.807,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3092472351482309,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.7309,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3737027071808558,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.8191,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.36607238095868505,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7998,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4042496034179054,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.773,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.37764639494375285,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.8055,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3348761239337774,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7443,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.37626992146366794,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7826,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3665662199313722,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.8335,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.33785931924439677,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7972,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3880113242436853,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.8896,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3721493819848271,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.8183,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5198646512996709,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.8566,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4065329950447661,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.9147,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.37859745509497567,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.8042,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3607765773764099,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7938,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4092950714526401,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.806,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.30432228901089703,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7262,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.37088903131596684,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7169,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3904681317398879,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.8104,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.46898322895625755,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8688,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3006038153490141,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7169,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.37810250308131216,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.789,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.2989111481815409,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7303,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.32911143249763547,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7592,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.35549746840897417,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.8175,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.31577828848903333,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7581,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.38150936709065336,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7383,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.352873952674236,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7899,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3729395002008322,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.8267,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.38612103435747785,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.8438,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3343773300394658,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7766,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.34821484213233017,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7869,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.34452187805524936,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.8142,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.389690433062319,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.75,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4645823764551201,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.9086,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3374398934171241,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7671,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4111564099874274,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.83,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.4082259005237418,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.8499,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.35262858393716395,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.8093,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.41355266235382226,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.8087,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.41241448924179946,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7911,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3279954208792422,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7616,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3710790783951858,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7417,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3638956295243486,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7734,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.45340861210629174,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.845,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.32432848532039027,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7292,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.33285800890422707,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.6951,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.36168512959628546,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7777,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3755110018820757,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.7907,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4746969071550545,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8731,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3485421714780257,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7017,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3522895092881672,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7341,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3171714951862727,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.778,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.30854770168466894,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7326,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4491439626115905,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.8296,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.37090037131233355,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7953,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3834261067678261,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.9055,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3255432432056942,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7592,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.38151180353083647,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7883,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.43340018294551996,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.9072,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4692009486980658,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.8892,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.34906122041207605,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7177,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.357075923035099,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.7832,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3853853663460861,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7875,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3162997323586251,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7652,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5058370501510021,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.788,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.38934979473108083,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.8643,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.36714032393813667,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7469,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3488248102724531,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7318,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.294650221806305,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7344,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3540730290394561,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.7354,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4226203562757057,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.808,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.4014616869239377,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7749,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3934215620886321,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.8003,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3353659014526165,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7851,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3700720016155327,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7941,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.31100504311573046,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7298,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4172712953645561,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7686,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.34667441316708114,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7106,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3004518824396042,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.6887,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.38133602184118887,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.8172,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.33116090594086295,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7005,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3188298094332972,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.7534,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4403999231950908,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.8213,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3608087635662593,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7844,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.40474388045501003,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.8244,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3389637770265784,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7018,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.30643971186231883,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7619,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3661184834317572,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.703,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3671129154556767,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7824,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.34883381196841734,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.8148,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.32439361737968153,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7575,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3866839936796524,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.8134,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3825547694903366,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7618,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.31827546790247363,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.7321,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.29623028045498556,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.6688,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.40429655628990896,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.8106,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3650008962489634,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7758,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.34654671660631003,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.7488,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3417601789633415,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7376,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4766922892989375,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.8448,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3764754677513324,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.756,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3744144624463219,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.7769,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.41513981451373816,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7734,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.36679556444218603,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7664,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3859580111548229,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7639,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.34312029909834285,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.8412,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4680442538250724,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.8093,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.3160450323270329,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.709,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.45915080458361,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.8064,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3737969883986542,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.8277,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3789680381062684,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.8404,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.35233074082938165,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7446,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3337846639484774,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7811,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.35005314677512966,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.8336,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.38107841486064986,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.768,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.37188047064663365,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.7323,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3528962453383034,
+      "learning_rate": 0.0,
+      "loss": 0.7924,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 736497759813632.0,
+      "train_loss": 0.8298983348485751,
+      "train_runtime": 9149.0946,
+      "train_samples_per_second": 1.093,
+      "train_steps_per_second": 0.034
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 736497759813632.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d4056ad0dbfa8e52c3d650ef42c4176842a29a2
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "up_proj",
+    "k_proj",
+    "down_proj",
+    "o_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bd05a13aa1ce47d33a2febabc444d886ceb92f02
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61659b7548812b91a870a85aa48869803be2f9ca23408f3b764320bcca5a24cc
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..df16aa2da0167ff7227b487b3ad71db9acb74e88
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf545e27d23f76953d9bdd19703edae1ee5f1f56973e863e6006b8a69c875de6
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fac7f60a2000570e9c2cc10cf22fb9865077b415
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.0096366749232497,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4374,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8948340509042738,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2714,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.9673649083623338,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.4828,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8440321862225117,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.3253,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7091743584914151,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.0561,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8684015209033978,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.262,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.8160514552806475,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.137,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.0206577145842695,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.2182,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 1.2355330127135409,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.0205,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.3760136594787162,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.1951,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.9226219355030841,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.2095,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6047620007662615,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 1.0607,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.6223476017826282,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9295,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.6634867557367036,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 1.0089,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6942121550576138,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 1.1134,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6517191782439055,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9435,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5988935455903787,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.9725,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.7282000378549868,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9245,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.6923516692944924,
+      "learning_rate": 0.0002,
+      "loss": 0.8958,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7377312637478807,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.9923,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.6991529429653036,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.999,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.8442525167505112,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.9775,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.6807612405968998,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 1.0393,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.615363691022571,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.9465,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6560600622456343,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9816,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5510437232602844,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9114,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.4954059060906038,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.898,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6151884500603105,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.9211,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.6750804410936759,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8359,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6368391831488008,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 1.0608,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5590401200511067,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8982,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.580488933744179,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.829,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.5417957562757602,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8353,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5212173330120858,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.9113,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.5403546577518331,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.9352,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5861554161890207,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.7743,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.47500481352954615,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8628,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5702370795138746,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.969,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.699121456839057,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.9597,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5030183653307678,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.881,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.4884693592955745,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8358,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.5321361462838011,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.955,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5495909582071099,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8622,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6073671597800279,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 1.0347,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.4728710468686486,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7384,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.7776863529098328,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.9723,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4965532465566332,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8489,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.6419371462528843,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9119,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4900525424240958,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.918,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7854429462112946,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 1.0194,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.6104771933997867,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.9132,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.6330639296462902,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.9517,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5137018102790541,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8625,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.567695702179213,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7726,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.6488471177320068,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.9213,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5733470182435702,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.9435,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5921455538031671,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8527,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.6450522077889146,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.9521,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4846588081306062,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8395,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5775568058546156,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.9154,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.7377113840637111,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 1.0698,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5605229143638871,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8817,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.7495342647669407,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 1.0097,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.550991870286073,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.9113,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5425725126763408,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8622,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.6512932570483941,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.9442,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5568640098957766,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 1.0085,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5735714231784806,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.9677,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5629368451300993,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.9292,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5221544806967634,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.807,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.5097003015787136,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8862,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.7130106931007483,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 1.0205,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.5909339026171039,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8005,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.6205480027240491,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.9101,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5936367748834522,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8914,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.6143835874726807,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 1.0257,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.628841593688209,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8988,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.65749905037523,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.9804,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.512109193461066,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.8041,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5743118892441462,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.9568,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.5843364516301627,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 1.0273,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.48304850535774974,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8177,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.5673332343154109,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.9013,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5256210172682816,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.9962,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.5183245117161408,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.879,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5788927945560342,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.98,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5578708655181941,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8087,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5111618044176756,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8549,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.5162760624599195,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.8345,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6038786090279482,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8071,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.5764856734582621,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8512,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.6015667973666334,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.9051,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.5150471254958291,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8612,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5666214066987976,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8885,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5588852331852312,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8689,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.619713421628621,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.9909,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.5268564220803525,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8578,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5514584533393524,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8725,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.47718826031070855,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8619,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.6802385479715022,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8541,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.5706682607862557,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8378,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5547872781612694,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8905,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.6894764479614839,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.9693,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5932521455664701,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.9117,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5870941936867601,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8487,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.6797262861796305,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.9535,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4716956058454649,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8018,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.49547117978913946,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8445,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4593534956395072,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8172,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.6000493180254652,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8511,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.5423254674909876,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.9555,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.49179013108092784,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8105,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5168368220680261,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8747,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.6167191146008828,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.9368,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5340237099838037,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.9434,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.46618744924150934,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.799,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.45713405322430556,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8733,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5280690404047126,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8664,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4986696223613787,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.9274,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.482812371619118,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8653,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.6464756221598743,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.9778,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.49372856222969597,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.9034,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.6101374114608618,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 1.0121,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.5451785990596633,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.9122,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5325921981933636,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8264,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4374769818891512,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7988,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5792771553367638,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 1.0061,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5482916874485357,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.8481,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4732609287832736,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7663,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.568902212282601,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.9781,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5611066018344876,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.9428,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.569003642407584,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.9228,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.49019024925155885,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.9015,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.7476848004489727,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 1.101,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4729681535702118,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.8703,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.41394830893342704,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7054,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.5595697193386046,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.863,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.49087790464884334,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8048,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.5921116083659788,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.9277,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5938076846018375,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8797,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.5213244559998247,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8714,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.7057083039155069,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 1.0481,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.47407514110812166,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8425,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5417477528697916,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8626,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.41983025228173904,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.755,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.7076736950806536,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.9116,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5234432801669066,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.811,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5968616055060801,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.895,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.793915770912191,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.8595,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.6725638184434034,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8935,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.6972977360166016,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.8783,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5313749491420932,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8844,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.5373046766235955,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8982,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5335095314794713,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.8746,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4444566104179798,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7231,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5028375661309075,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8662,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.4991388775797579,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8282,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.47893484287330357,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.821,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.5067770890271315,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7882,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5165413509356288,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.8471,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5066161228181565,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.9031,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.547029143598491,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8558,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5612378280968916,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.9072,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.6076830542007384,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.9033,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.5216970388907911,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7836,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.54503973121516,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.9014,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.5231650990525085,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8632,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5497366992380714,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.9476,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.487884829363121,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.923,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4527026078347616,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7776,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.5636470844990181,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.9261,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.5861311972784812,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.9221,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.5761782641318623,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8741,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4680203522610832,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8434,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.6785300369860278,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.9182,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4488327447605338,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.8132,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.5089109530270849,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.9513,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5302057448662424,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.9494,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.46947064555349594,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7372,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5028005654161093,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7989,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.567498817932682,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.8853,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4942081354946293,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8184,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.5062989816972167,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.8978,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4493194997623283,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7454,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4896729559250561,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.835,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.547081954048003,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.9449,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.5275769947570887,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8247,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.6796089522335953,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.8879,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5234490468144752,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.8826,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5277313141615699,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8201,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.5308683592824113,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.8342,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.48418803817401984,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.8408,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.5928255690072303,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.9769,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5171192071858057,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.772,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.4809784376625068,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.8239,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.5779211230055995,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.8529,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.47891105435481995,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8373,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4735620706120006,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.8068,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.495276098160408,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.8535,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5419221465233577,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.9133,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4504332420679137,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7281,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5600527896983009,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.92,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.6393077989847463,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.8677,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.41150204685915415,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7436,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.556864628656174,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.8779,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5291233938095491,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7963,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.5046584072913672,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.8654,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.5240103584920848,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.8112,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5044515321240335,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.8705,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4938024171655887,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8458,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5781772950745199,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.9357,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4885521758927064,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.8204,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.5716873482544789,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.9514,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.5310263888459398,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.8385,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.7282643031114008,
+      "learning_rate": 0.000152669141192587,
+      "loss": 1.052,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5388177337573673,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8633,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.6082153413889909,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.8942,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4464166985290885,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7651,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.5067701779711128,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8417,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5422232377837102,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.8779,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.5080300860627728,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8594,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.5405630172426878,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.8484,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.469049466904632,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7741,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.8183089260837014,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 1.0074,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.565514371396207,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7484,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4560321672187736,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.8308,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.5094977032065641,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7723,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.5465252160330539,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.8305,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.49877118353457706,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8814,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4757523663635892,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.8468,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.570445975575914,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.8768,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5220926560957806,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.8339,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.5963199291731878,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.8413,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.6539066454841529,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.9081,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5330887389011021,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.8046,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5286425366443326,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.8227,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.5786370821844616,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.848,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4410630748313712,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7533,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.5041131428774268,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.9019,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5778662455305288,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.9103,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.5195096672679763,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7727,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5788932105493327,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.9388,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.5003143934702647,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.844,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.5090658710909317,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.8011,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.6686484237982562,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.8939,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4700819361928644,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7804,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.477707037218358,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.8647,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.5100411750377246,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.8308,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.5520174664334276,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7925,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5345884806512073,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.8837,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.4212316530799322,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.732,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4897164868688977,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.825,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.49605395233621785,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.865,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.6625415135845697,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.8593,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.553170643987391,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.8705,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.6643025128072408,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.9046,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.5894795402676914,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.9416,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.48140328572847857,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.8482,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.40453568550694935,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7391,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5164507723111476,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7956,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4615978212018176,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.8318,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3542694167031475,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7276,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4704020278094193,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7627,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.51176404471867,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7088,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5172875242169087,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.8318,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4818432249996395,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.8178,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.53278599717518,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.793,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.5089707307645357,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7884,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.5844341690431064,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.9081,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.47824424220999784,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7801,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4688946946883383,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.8726,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.5863951665528098,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.9242,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.4986765195767016,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.8647,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.44194973191631093,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7731,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.524763930809003,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7547,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4791715166387513,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.8751,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.5291947028572765,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.8063,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.5291253034564499,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.8164,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.6087123742013226,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.8764,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5088002178570323,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.9018,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.5482575319396714,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7488,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4918049872566382,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.785,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.48992359397276064,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.8477,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.47183065490104775,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7776,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.48219118142939943,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.781,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5623320974572162,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.8046,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4304120017785138,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7408,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5138640645168956,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.8282,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.5168258684089355,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.8046,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4901583230171197,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7845,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4501375535429379,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.8194,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.5752810272957825,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.9429,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4843449013623304,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7618,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4389650913586527,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.8281,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.38966803549086076,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.6863,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.6881335554250402,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.8981,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.4368376239595057,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.8062,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.47980937375435045,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.8022,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.5063687813395686,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.8474,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.517176521428431,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.8625,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.525563910745772,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.8449,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.43808905380694463,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.8178,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.5380704534450393,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.8145,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.46864468049328967,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.8781,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.6074831901623694,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.9485,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.41120141930841214,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6888,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.5125835564124221,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7902,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4531825151746322,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7998,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5178564006244549,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.8318,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.5414206162664771,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.8915,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.5253911905963955,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.839,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.624540846119554,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.95,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.39705628784380886,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6871,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4799080580696934,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.9107,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5105695206974173,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.8726,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.49379073641651894,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7204,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4591049058322344,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.8084,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.48025358544189095,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.8382,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.5409338192777509,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.813,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5043038446677717,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.8907,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.48909846693864667,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7788,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.42189383231534316,
+      "learning_rate": 0.0001,
+      "loss": 0.7254,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.6498002750364892,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.9458,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.5860702994817408,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.8266,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.5226966210384961,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.8119,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6643669732939301,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.9326,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.43800478347384875,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6661,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.5243694827992629,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7644,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.45520338789668585,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7966,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.43992966853064996,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.708,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.48807994095775004,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7901,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.40728153509013215,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6695,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.5246214434675819,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.8237,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4145944652837226,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7821,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.385472132727159,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7369,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4082563502752439,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7811,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.4961172882055245,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7675,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5595008692081654,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.8228,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.5736206767177207,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7259,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.48020249717554114,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7927,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.45753509008065996,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.742,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5851829562502802,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.9126,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.4498037450909005,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7648,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4802454965679165,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7847,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.541887107283236,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7783,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.37235854465942453,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.715,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.5467283085990748,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.808,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4112013023922369,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7234,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.5217650746703775,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.8946,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5472679043608458,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.9063,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.5696625213498148,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7161,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4604926826480909,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.899,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.398273916569123,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.7019,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4701540042555817,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7066,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.5286874784470306,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.8726,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4697754629410804,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7847,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.47148247200066923,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7888,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.7228789592171904,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.8493,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.571409757185734,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.9002,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.6996011540301896,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.9761,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.5315193279605486,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.9298,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.5173413967085707,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7964,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.49680647790677457,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.8238,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.4823969047811752,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.7636,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.5240306301645788,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.824,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.5044596704389451,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.8629,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.6192386484258986,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.8826,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.44443525426370883,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.826,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3859963085998379,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.693,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4070405251672028,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.717,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.5190833271778817,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7967,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.547840043801605,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.8314,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.3432880095460033,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.647,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.5316470427477119,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.863,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4811809918367705,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.8589,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.587522593255453,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.9239,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.5101390040859074,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.8205,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.47299161769213066,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7769,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.4980906621494592,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7362,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.45084007543847454,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.8285,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.4836606210247619,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7968,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3951160758966632,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7392,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.45778817160921503,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7986,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4843763278989223,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7425,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.45512976181686055,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7525,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.7081591028683811,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7895,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.39282677609329236,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7479,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.5164174239408281,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.8474,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.45594928656170175,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7878,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.45416952920551357,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7734,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.5455294041502883,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.8201,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.35790769735856265,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7394,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.5229477084868099,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7682,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.47717369586725983,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7614,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.5037374425648201,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7593,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.5954151705753504,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.8331,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.40732967945556364,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6671,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.5883567547120468,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 1.0437,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.36897944283799283,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.669,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.5462323767313574,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7599,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.5457773022621287,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.8281,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.5010164358058302,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6825,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.4512123705344787,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7603,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.48649071526542725,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.898,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.39953097613155114,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.77,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4872181690140072,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.8275,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.43255402627612966,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7495,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.6018477874503864,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.8628,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.5181961379417853,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.8858,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.47762880599177837,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.8257,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.666641390882161,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.8793,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5921649166111956,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.8918,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.5273286616462989,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.8519,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.5399503768360546,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.9754,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4764747755157641,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7807,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4952819078850315,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7823,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.48267213098026895,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7564,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4463758435676959,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7389,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.43327085763464696,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6847,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.6036131191772912,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.8528,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.42564486228965653,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7601,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4677345793076768,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7813,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.4323200788320825,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6346,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4308229821971401,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7071,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5102181049077796,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.8207,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5516546810893967,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7675,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.6743727371325823,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.8203,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.6068926449512423,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.794,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.7362811712148732,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7252,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4210000906061205,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7346,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.5725440779433182,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7607,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5351904912564988,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6999,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.425204369840092,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.7229,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.4356200944892181,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7624,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.46971976933807047,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7964,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4742280691806812,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7456,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.5749318956736674,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.9022,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.44099195153929255,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7516,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.45087710875438175,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.7756,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4574012541894237,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7861,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.4044503407198491,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.687,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.5482144557534906,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6576,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4072979799478154,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7551,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.5519913893567424,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7751,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.427572422181255,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7599,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.5955275501300678,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.8783,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.5517377691278346,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.8424,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5541228187635037,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7198,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3991511164885935,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.732,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5060165936708827,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7768,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.48672533089925063,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.8013,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5253671257368212,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7999,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.3633770596188525,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7074,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.449494471923052,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.8176,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.432320602559632,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.684,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4434694550352049,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7871,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.8206516083660113,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.9475,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4937657086859226,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.7522,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.49440865555601776,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7201,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4175034107778847,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7441,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.5157776265243595,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.8169,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5661252354929321,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.8418,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.4601424298620791,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7333,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.526257932261844,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.9005,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.49758181851348093,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7995,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4743048138130881,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.7818,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.4107871656143851,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7582,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.6125756169265194,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.8057,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.4956432012606571,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.768,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5597263000321794,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.9382,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.39974074026990225,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7135,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4944138228262104,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7995,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.45886221709901115,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.7852,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4516610392627582,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7456,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.557783936084678,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.8968,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.4856834956172401,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.7805,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.6565524280130758,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.8707,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.6710177605881805,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.7805,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.42889943601873365,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7535,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4333118125395268,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.7255,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.5839841393173936,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.7486,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4521895534934514,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7484,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.520280552073544,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.812,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.45670805261710384,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.8,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5335285014546386,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.922,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4881079791204276,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7751,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.44987938879675854,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.7279,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.6833009708779766,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.9617,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4250791221610793,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7148,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5399826159178152,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7702,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.43743477925604757,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7157,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.40297369122305954,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.7481,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.49175256105542803,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.8124,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4373359544357551,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.737,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.54633843763496,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.7988,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.41604550782919175,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6769,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.5664311821278589,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.7408,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5897444993781782,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.8862,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.5089631988157485,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.7385,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5607696948444847,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.8308,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.5036784353906988,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.8341,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4874499905555397,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7482,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4893206979563326,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7605,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.46314958256468436,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.7855,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.461567993549935,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6404,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.6181078155420232,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.8067,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.4750427640337518,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6885,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4778704552565716,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.7967,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.8052259392554215,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 1.1212,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.6065221233351177,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7793,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.4023324353845839,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7498,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.43096596907532625,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6619,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.5331475367249551,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7365,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.46125235258257047,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.7711,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5770787921259529,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.8073,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.35681059458476244,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6854,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.47730334936577407,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.8211,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.446073273341204,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7873,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.5828635553989929,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.9511,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.6094073934049821,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.794,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.6164796392342045,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.9665,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.42342672248834473,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7665,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.6765032297992837,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.8345,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.5064304818411385,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.707,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.40089275275048747,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6586,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.5483517201907702,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.7828,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.360374178412597,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6721,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4454048336442163,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.774,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3721989071970144,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6956,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5985047489125143,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.9089,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.5237948237809888,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.8306,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.8648512711778988,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7997,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.6197763413747968,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.7867,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.42470141034649467,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6804,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4856713864563471,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.8431,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5330578663809168,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.8066,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.4947389161075935,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7729,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.41545998015118474,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7083,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.485544883719098,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.7167,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.6386894810307258,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.7929,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.46002750542797544,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7106,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.46868454363851236,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.7373,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.7144007673862546,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.7063,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.5069051513657338,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7728,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.5803596449574296,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.767,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4182408173362448,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.7044,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.4354868248843499,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7385,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.41941309017306944,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.7395,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.5371051126727617,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.7964,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5771696475203609,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.8346,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.4382500643697019,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.693,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.43605868921778695,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6877,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4165356960116476,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6415,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4129667645934099,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7575,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.6351768163984158,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.9496,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.41141560339732675,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.7321,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.5023405131637189,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.7426,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.42821444755882565,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.7429,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5459880627672273,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.9019,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.46115758430714615,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.7585,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.44139000235319503,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6966,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.4543809462072909,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.7594,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4495748022007052,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.8066,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.46347153258550544,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.7445,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.5280086571585247,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.7395,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.5660248162122429,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.8576,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.6919236328983153,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.8169,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3978352567607781,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.7173,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.46468456208961195,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7637,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.5054514384703425,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.8906,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.42486805180548026,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7185,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.49231465713121064,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.8126,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4264580566405121,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6775,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.5889294212994212,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.8009,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.5146199091910297,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7888,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5839724540785394,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7026,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.4683585152611421,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.7459,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.4079869469017918,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7465,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.46769691407696456,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.7662,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4261406956454604,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7229,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4667058373952707,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.7459,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.6382869126429721,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.8836,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.4921557339698548,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7706,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.48839344968871246,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7637,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.5485656778087554,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7947,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.5328073776988045,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.8176,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4746265132334152,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7356,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.5354721956600221,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.8329,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.5196997436322807,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.8083,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4919705407620299,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.71,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.5509294451771631,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.9046,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4322044034560731,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.7115,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.5441241973237635,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7648,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.42712086843003794,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7394,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.5224691303715251,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.776,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.5054549573817237,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.8071,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.5423938658985061,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.7444,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.5593259969244396,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.8354,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.6699218671190321,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.717,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.549979447808889,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.8225,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.43589293457086764,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.7761,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5393452630062003,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.8387,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.7906770173799234,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.785,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.45367166208698123,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6509,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.41705664578435075,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6741,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.5138034548727115,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7611,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.6706363126180388,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.8844,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.5175012983585358,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.7132,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.47568378499503156,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.7699,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5267143620594829,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.7824,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.5540117150452298,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.8809,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.36858401023066945,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.7154,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4195343631350608,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6821,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4821407975273321,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.8118,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4541170099073794,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.769,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.5000787751047758,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.8309,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.46285766823963,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7921,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.48469196593187697,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.8265,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.5571591206945543,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6982,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.43003274517721807,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.7066,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4022428797384306,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6786,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.5569976399581428,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.7958,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.45222096782817917,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7226,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.48513423816842716,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.7292,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.47414292362264654,
+      "learning_rate": 0.0,
+      "loss": 0.7807,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 508727619158016.0,
+      "train_loss": 0.8337012174606323,
+      "train_runtime": 9258.3008,
+      "train_samples_per_second": 1.08,
+      "train_steps_per_second": 0.068
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 508727619158016.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..13695ff226df5a430b650c69efd6adeabb849f6d
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "down_proj",
+    "gate_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3da89ebe1059f2e50e626c9fc0b98bdbce1e8bf8
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60e78440e8ad5c216e1bbb491649fb73d8e9423283d6831d816658f013ddea60
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a041162bf517056ff0271fabae4667a07b114ee8
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d89c8f41acde27e42e9a8a3b4e1df3f1d80d5199b474eab8c9904c815c8b6d0
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..180381c7f91a190e1985a0295e94966b0b2a4bcc
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7954520844133007,
+      "learning_rate": 2e-05,
+      "loss": 1.3544,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8486736733468345,
+      "learning_rate": 4e-05,
+      "loss": 1.4258,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7231129434225938,
+      "learning_rate": 6e-05,
+      "loss": 1.2565,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6549525568483539,
+      "learning_rate": 8e-05,
+      "loss": 1.301,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.6713937782391836,
+      "learning_rate": 0.0001,
+      "loss": 1.1783,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.7775279811022765,
+      "learning_rate": 0.00012,
+      "loss": 1.2018,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8210500566974728,
+      "learning_rate": 0.00014,
+      "loss": 1.0322,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.582072473996548,
+      "learning_rate": 0.00016,
+      "loss": 1.0719,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5332524469921555,
+      "learning_rate": 0.00018,
+      "loss": 0.9937,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4652344399947701,
+      "learning_rate": 0.0002,
+      "loss": 0.9582,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.528479459724629,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 1.0025,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.48862298453038183,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 1.0045,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4788245246121141,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9681,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.44036289873008344,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9215,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5449869090192805,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9593,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4237521722332184,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.8736,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4168713872622836,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8863,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.45264982727147834,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8696,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3929576698684051,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.916,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4415540009570294,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9198,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.3812265501303348,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8966,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4047740643644689,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9509,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.394670597027155,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8582,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.40883800080500177,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8873,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4945237937729557,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.9728,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4621016693300622,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.9232,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.6732021018661585,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8132,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4422468298010891,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.9266,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.40220173065598824,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8905,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.36918169793645084,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8733,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5597457386366598,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.9705,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5595809993867294,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9519,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4605324530129531,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.895,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.42815914241954167,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.9781,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4124008245937204,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.8654,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4388119798381529,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9486,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.41876874470531383,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8528,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4063934253065074,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.9519,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4281987478958129,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.931,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3995519502824548,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8706,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.39938546863455104,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.9081,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.41849656692135495,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.9316,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.38704750788751674,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.9122,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.36713908717782284,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8173,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.386332629442072,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.8054,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.40175758635877834,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.871,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3980262653132001,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.864,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4340588852915137,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.9131,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.424408064468781,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8575,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.42662287831550505,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8401,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4018922149491412,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8477,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4547104194477866,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.9216,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.46878889945327834,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8866,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3901101154924772,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8071,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3786177626239962,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.8155,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.37399747556550167,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8673,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4476713311081534,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.8913,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3695813169482114,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8532,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3740852677452882,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.851,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.37805895790025723,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8893,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.48550367409643636,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.9271,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4261985087419851,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.9485,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3506821138816665,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.7993,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.39926954240428475,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.9094,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.369474669842929,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8639,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.38412612507425553,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.92,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.6236483911477445,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.9896,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.40709892702353573,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7737,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.38384691117683545,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8189,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.419002960060546,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.8892,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4317304658707876,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.9425,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3605359976609917,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8432,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.38387195033721394,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8127,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4115773141922566,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.835,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.41682944760695084,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8527,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.44021561768781686,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8634,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3788641404846892,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.8657,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.35062726715426834,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.776,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.3640559161150146,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.8121,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.35673860083372294,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.8013,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.36719254075722085,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.864,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.39931819574163646,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8932,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3982022670626305,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.8254,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.3997476767736355,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8817,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3463507690011195,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8348,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4200196272874818,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.9056,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.3755782720326349,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.8419,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3981270184947585,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8491,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.37218427069658755,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.9352,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3395579879759133,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.7546,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.3673793454918801,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.833,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.34780686537520417,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8021,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.38425962561813065,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.8679,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.42936546302655365,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8377,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4267179181570991,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.8357,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.36916905077663476,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8232,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.38729223078060027,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.8608,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4065510182299363,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8215,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.35357551053351133,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.8115,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.0001838046966431,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8702,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.40193375582947716,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.8128,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3935235182421146,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7955,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.6013285238613454,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.8225,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3787137979507313,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8257,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.38425572159674526,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.8475,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4006328412662351,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.8648,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4086678609977253,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.8863,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4881485864286447,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.9356,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4025668507825546,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8127,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3710129691466552,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8428,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4003824576144442,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.8425,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5018583001311245,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.8779,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.36121263099811374,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7713,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4251687064188239,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.7802,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3569667148399426,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8449,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4066427039698594,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.8426,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.44110171219803324,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.8606,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.37113662406191406,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8036,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.36927548404036525,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.7892,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3628801931909292,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8945,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3881852305317355,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.8457,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3493063856843603,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.8096,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4238637282608415,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.8239,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.35268751443414337,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.8348,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.39291285524686237,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.8241,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.34506213306561334,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7677,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4203851880192952,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8461,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4474038709406084,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.8768,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3778914426775528,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.8762,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3583604676991779,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.749,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3201977553502979,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.771,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.327874635613636,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7247,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.36654127787678314,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.8057,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.37665903879585244,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7767,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4034764628456416,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.8348,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3907539802126158,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.8872,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3569846944905879,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.8112,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.36188126942655424,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.8019,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.379019802148045,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7994,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.41234904215035534,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8809,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.38868434007296393,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7578,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.350346834606368,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.8003,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4135182041369236,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7891,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3447428610535347,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7747,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3659043229276162,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7847,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3627598928666497,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8729,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3316910349747776,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7812,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3946570922891007,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7865,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.3359636799382807,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7965,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.37137439376427284,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.849,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3574031642165924,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.825,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.38556543326751924,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8379,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.38805213476282924,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.8097,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3574217944877886,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7886,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.39271223057544774,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.8542,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4324299524551468,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.8882,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.32450275667877587,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7915,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.37732713298950105,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7915,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3406928749347663,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.817,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3751715626743408,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.8437,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.33757211373718954,
+      "learning_rate": 0.0001,
+      "loss": 0.7485,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.46079705496359785,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.8788,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.4271610568124271,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.8616,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3730244116708226,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.713,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.32767683574358164,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7503,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.32244567576195554,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7239,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3397126044271277,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7984,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3012960591552935,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7531,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.37938450909301,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7915,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.38701381132864615,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7544,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.38690509598881456,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.8261,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3662178194553636,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7712,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.325465438950959,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7461,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.36372519207365633,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7577,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.39791618344605173,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.8952,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.39106877210397273,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.8067,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.32165292969442727,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.704,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.36154222910864736,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.8184,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.42719982445728705,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.8165,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5246656865774878,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.9235,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3889988384699533,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.8564,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.34293279991160774,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7917,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.43448804859625373,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.8397,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.37332421671048194,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.847,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.2847408908726539,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7008,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3793630586511894,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8073,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3314978495730383,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7501,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.38192244839668066,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.8806,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.35144673379944147,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7983,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3320272570958794,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7778,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3245100936187699,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.7638,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3487129086320174,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.771,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.35931606259571913,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7663,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3367390991318867,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7966,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.33183255910992066,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.78,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3264528270630262,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7772,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3517378205790495,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7642,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.431338962220006,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7927,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3731631592093385,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.8503,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.33909105312590193,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7154,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.36200420260594113,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7569,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4149004455820632,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.8286,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.32292850370856624,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.8015,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3771935747682809,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.803,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3541848583253109,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.8545,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.44832715565151354,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.8812,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.37472326339826867,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.9129,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.35228666839882417,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7832,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3350939709150189,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7491,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.38948109383274765,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7651,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.32094235443298985,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.769,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.30333546694103175,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.6676,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.36011644742045107,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7948,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4501270149702274,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8075,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.32443270170477734,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7327,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.35437103043306173,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7306,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.32026538320410597,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7387,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3565566202307532,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7722,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3770279902011318,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.8262,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3225427774272655,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7808,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3643305874741898,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.6751,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3495365472341216,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7634,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3865725013932627,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.8201,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.39371326689707414,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.783,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3343399455415386,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7558,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.37018368726419704,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.8034,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.2917671574400929,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7608,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3149055123541856,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7376,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.48194732886134795,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.8543,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3201762925229535,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7311,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3832969139432672,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.8361,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3591778802980467,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.8246,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.33711687357568193,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.7911,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3992050715809478,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.784,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3717791655175765,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.8544,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3266902771814692,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.758,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.35563021204215095,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7699,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.379714101588036,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.8409,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.47451149824579747,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.8289,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3188599450071592,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7431,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3537473485993502,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7519,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.35260123842359997,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.8052,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3782134918689213,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.8498,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4240797008066036,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8504,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3493922027357962,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.748,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3109975196467063,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7332,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3490386685549296,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7745,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.36175624431520176,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7388,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4367153475876239,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.8179,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.39966660383957925,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7862,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.39454280119505963,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7943,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.34240338202982135,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7764,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3836219267710939,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7249,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3545049008632808,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.745,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4782319117565856,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.9546,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.30912276688779844,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7098,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3564793276918867,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.7553,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.34525121870000597,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7499,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3349420082155695,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.805,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.433540567668089,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.877,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3801150070687427,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.8694,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3738768456908641,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7737,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3801506817792511,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7256,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.29294064483194054,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7278,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3574084890505441,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.8017,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.44325322418927066,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.8188,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.33465511889302607,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7404,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.35582143701746155,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.828,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.32908366694518915,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7496,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3923286422615176,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7605,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.32871379992022076,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7252,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4464878317171815,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7487,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.37431391710968503,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7393,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.31383965433569194,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7417,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.40622594368739234,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.8212,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.312688075077189,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.6931,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.2951795837647385,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.7062,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3906241449752177,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.8502,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.33976393181898756,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7466,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3645631566569671,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.8331,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.32171725161543063,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7301,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3215222867159588,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7803,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3930936220455362,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.8015,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4009757739317676,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7702,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3509902652942518,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.8313,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4823527769540213,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7737,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.37189019955946206,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7485,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3929380712488443,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7541,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3230172315650001,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.7514,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.331298596116422,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7476,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.41208820254007095,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.8253,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3540797011234827,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7754,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.40367747859127645,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.8121,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.36985043876259127,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7875,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3757213402313114,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7626,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3519319526157233,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.8127,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3733908580196815,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.7547,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.36965768774397506,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7968,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.392672840473129,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7971,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.35145171210243764,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7712,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3557470819008264,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.8122,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.39875493221618996,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7222,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.3398025671483795,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7241,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.44734223609044316,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.8064,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.36261141044117756,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7815,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 1.1171234591773465,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7992,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3255664161802146,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7498,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.34989009087457607,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.8037,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.33217965843871977,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.8156,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3603261858382364,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7071,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3558631704357258,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.742,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3200726372631764,
+      "learning_rate": 0.0,
+      "loss": 0.7306,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 739882583326720.0,
+      "train_loss": 0.8309416449986972,
+      "train_runtime": 9163.8195,
+      "train_samples_per_second": 1.091,
+      "train_steps_per_second": 0.034
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 739882583326720.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e03b45cf925e222cf06ea1a1800cc06b498db25f
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "k_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a927f46d45d5d66d44ba52e15be6346557d16204
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:807df09410a8bac8441b10edd939efedcbdf49e2206c5d73aed6963a3087c65e
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2c15821cbe92871abfc72a15ea48fb61c63f13ec
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a957a55853852e70c4e4559e96f956dcefc8993421eea81bbc558164c8b426a
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8246e58fb9fa0d592b4db2af835e1500da02c9f0
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.8254008371684874,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.2336,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9001692078599441,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.1506,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 1.1122164974039503,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.4268,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8966554613374759,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4789,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.8525456477268124,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.1729,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.7669096680011646,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.354,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.7345567162332799,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.2388,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9536980255199109,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.4101,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.9415953438208667,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.145,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8529883237546517,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.1384,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 1.126915375873681,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.2245,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.9167812550021163,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.127,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.8528831790324976,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 1.0553,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.9532646265952134,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1337,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.880662805694576,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 1.1473,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9702786952678096,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9314,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.7029363111637975,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.9208,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5266694454511093,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8928,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.682516304944438,
+      "learning_rate": 0.0001,
+      "loss": 0.9943,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5953207994894946,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8949,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.6906825492076013,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 1.1288,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.671075887310189,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9903,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5100061823856961,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.8214,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5954574516082615,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8619,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.4735325474803045,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.794,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5792859050154008,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8778,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.8063892780047107,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 1.0306,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.6768891635446834,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 1.0523,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.5673538618908092,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.9252,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.7017082041353666,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8449,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.657083612318453,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.9792,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5097573928562225,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8692,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.7351668590114271,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 1.0574,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.45662861605853267,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8089,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.63199243924169,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.9736,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5285096956312785,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.863,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.4865534099301057,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8491,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.7465752627954467,
+      "learning_rate": 0.0002,
+      "loss": 1.0501,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.6777157931085108,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.9819,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.48012712623173426,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.7824,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.7025851138868436,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.8443,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5804040068154181,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9188,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.6174137093451777,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.8948,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.565237666232095,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8635,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.7716355173434238,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.9928,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.7912711363450475,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 1.0359,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.6700910817301969,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.9516,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.585045306082608,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.9152,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.5846940869630389,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.983,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6064015603095227,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9917,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.5600487528206084,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.9052,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.637614919803038,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9627,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.5057824749130014,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8209,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.6987392056027488,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8004,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.5920113344336018,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.9805,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6142815169052033,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 1.0478,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.5068824296441238,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.9216,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5597225389448093,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8183,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.5763303749670541,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.945,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5366233300766187,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8461,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.6596429498464922,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.9229,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5976747715695442,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.9659,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.5164856598070255,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.8142,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5430531234847793,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8592,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.5091259024865504,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.8416,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.46998804920242443,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8344,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.5619349448714933,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.9138,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5929775120851979,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.892,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.552548233857106,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.9087,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.860017864083892,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.9755,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.4914721627776415,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.8207,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4573685117193804,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.7282,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.5094000388508044,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.9036,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.7188280395095236,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 1.0293,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5685404562203534,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.9022,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.56915513588708,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.9357,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.6290734267531265,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.9204,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.5231699704733803,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8465,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.552839164836695,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.8066,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5533739578556125,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8253,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.5441874080378364,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.8499,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5783011755944921,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8391,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.5227658869368782,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.8002,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.6353336941951139,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.952,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.4849015113603158,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.8252,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5612381543734479,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.9493,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.49033130992014856,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.8418,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6398349798953992,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 1.0504,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.5526787869064176,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.877,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5521455522404153,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8892,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.519755619878471,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.899,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.46629053718261354,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7694,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.6171050549681674,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.9466,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.49479909567521657,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8541,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.5780972034978296,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.9026,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.580664933003876,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9675,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.6179081879462358,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.8656,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.6391035686987561,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.9212,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.5877945457338969,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.9321,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.44098932185732936,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.7673,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.6794514741827338,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.923,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.5650947115406529,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.945,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.5554518672223475,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.895,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.6049279312276732,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.9053,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.7746043824252926,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.9953,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.6332061215183638,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.9304,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.5108607839446464,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.8559,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.6926921479668185,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.9331,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.6264027420575131,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.9285,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.6330186637950089,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.791,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.6052528890139297,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.9436,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.6075773668650208,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.9489,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.47079173468326774,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.8329,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5974572151753968,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8849,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.6647289755145273,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.8816,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5682715106131008,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.8941,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.482152650595725,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.8777,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.5165789823618901,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.941,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.5206375165586081,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.848,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5473354335363099,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8837,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.5867533232230686,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.8873,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.5004520879149664,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.9032,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.5660056802785558,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.8528,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.47198566166403494,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8371,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.531985413922441,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.9518,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.6590497176544988,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.9315,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.5399341149414711,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.8771,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.7152022612823268,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.9737,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.5632111210435217,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.9494,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.6494432815425615,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.9395,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.645820404697124,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 1.0063,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.6386019532827905,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8572,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.7541722440377103,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.9837,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.7130713099133933,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.9495,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 1.0102407806146998,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 1.1071,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.6399714005772517,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 1.0277,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.5520647074833175,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.8267,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.6811703160571222,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.9764,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.4951819776586182,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.8814,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.7252144114525896,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8994,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.6231396634246661,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.9026,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.5406149720059259,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8788,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.59254392448039,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.8767,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.581699642170081,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.9353,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.5105455362571412,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.903,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.5100626366765849,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8864,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.5844083492003826,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.9265,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5203118186526728,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8405,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.567099504639143,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.9201,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.600935593665811,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7433,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.6859220255955671,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 1.0724,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.7133179044607507,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8981,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.4567081431598064,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.8523,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.6231102612177877,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8895,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.5222674271560935,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.8027,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5662569267762858,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8573,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.4781599460417116,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.8367,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.7033651527431625,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.9358,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.8097342224747683,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 1.0317,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5806969757982609,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8729,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.5303743310414841,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.8312,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4256547422428516,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7859,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.4653065719768864,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.8421,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4996515574568142,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7827,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.5801231538350596,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.9412,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.7326758199494244,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8973,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.5444005275625907,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.7796,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5803129383288371,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8867,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.5135925841351157,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.8538,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.6383901031960644,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7942,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.6122442084128001,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.9368,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.6087794842053829,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8404,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.5564528142757547,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.8325,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.4850567265061148,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8228,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5221162064849709,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.8094,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5151874566600034,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8858,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.4878709551254769,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.787,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.7233030595933002,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.9651,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.5130775448030487,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.8825,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.44839345672476477,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8117,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.5883691759941093,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.8219,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.5423548373179294,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.896,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.5423744889933809,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.9858,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5418330307570707,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8639,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.6298302748895126,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.8705,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.49816030141794515,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.804,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.5939853111214843,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.9174,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.48692067227331565,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8227,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.5617378319201108,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.8516,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5479004944647015,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8182,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.5583539591117561,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.8833,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.628958491801015,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.967,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.527302060387629,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.8472,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.6949486987354203,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.9665,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.459939602241374,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.7723,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.48937827881685525,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.751,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.5767992998755989,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.84,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4694125748671859,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8372,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.6001006685725776,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.9252,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5144273620084883,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.9258,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.43796251272775827,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.7584,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.5936578788134119,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.9065,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.5598625984686376,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.8307,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5443220379494453,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.9247,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.8184433595636279,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 1.0944,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.6500568742123364,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.9583,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.5160545784531383,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.8555,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5811791862451305,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.921,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.5369674097145263,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.8913,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5059613450805891,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7337,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.6116792895453856,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.8973,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.6389591277804562,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.9158,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.4871039962986681,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.8081,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.49726856016557125,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8494,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.47202207333137924,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.7875,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5000434141921354,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8727,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.6404336077945951,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.9935,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.7932723905343942,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8474,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.5249879319383836,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.8568,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.6846471757505506,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.9763,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.6000463340587131,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.8808,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.7270437314662276,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.9453,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.7129826804040741,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.8448,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5044831020418066,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8267,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5375845762581131,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.9102,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.4715329765658659,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8088,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.603833247123497,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.8914,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.6322228261977552,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8906,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.6331290721160587,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.8661,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5569009553200088,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.8732,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.6660678270910944,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.7198,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.6353279140108365,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8603,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.46124480588647165,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.727,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.5928128178155065,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8627,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.5546350490698052,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.9084,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.7639344549876476,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 1.0281,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.6056069204561181,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.952,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.447404951110955,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7797,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.4505850429200494,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.7605,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.528153338217792,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8871,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.4960386278474408,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.8679,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 1.0266285355191271,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8459,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.5261590632654706,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.8645,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5402321028707199,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8604,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.4688281180091872,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.8419,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.47272540628058707,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7914,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.6073714592230718,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.8612,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.40992042568299897,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7814,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.5171420488179562,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.8596,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5810998337097157,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.9191,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.5016026004732917,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.7857,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5290998464684912,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8011,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.47571676983361944,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.7898,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5693979765389368,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.9489,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.49758148747256714,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.7966,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.6590744659282638,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 1.001,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.6070782655473008,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.9436,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4765173100361963,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7486,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.5831060410544467,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.8634,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.48581764512842424,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8026,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.5695229798504049,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.8629,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.6269406052489828,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.9387,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.5365713230097443,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.8277,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5428528320354593,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.9152,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.7005906897335021,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 1.0014,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.6246304282232839,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.8466,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.5813641119949418,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.9221,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.5936386939998907,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.8933,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.6682132049959943,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 1.012,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5003392083890684,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.8073,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.4823222570626458,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.8353,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4836212855322634,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.6703,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.570778634672041,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.9101,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.7716364793392765,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 1.0361,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5747014332332541,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.8027,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.459727165390823,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7691,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.5631044109219594,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.8622,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.6170736779394916,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.9833,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.5320835110335193,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.8726,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.7106108656925253,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8326,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.5699051480731491,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.9784,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.565038805973298,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7859,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.5309404108508301,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.8548,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.49634103976736577,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7807,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.4211605885290296,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.7503,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.5065282499152559,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8007,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.5023885538954765,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.848,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5619787151600203,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.9279,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.6090409508083451,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.882,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5815566266550105,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.9173,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.5107579093746113,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.8693,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.5525656058505722,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.9301,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.5387814059324538,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.9038,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5241245952403185,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8477,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.5922467143505036,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.826,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4164260795172474,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7151,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.6322872381363032,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.9421,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.6822273627206625,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 1.056,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.5456287789046688,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.8608,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.6332916232764236,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8991,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.48183332292100656,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.7591,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.5164293419792554,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7607,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.6776115879178484,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.9309,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.41893179244042816,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8024,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.4524838840827557,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.7879,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.504512145895528,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7544,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.5604924118321215,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.8203,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5185653394154404,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7878,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.7509584334458808,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 1.1157,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.45356992265873464,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.744,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.6062594546758928,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.9007,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.459860351968077,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7948,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.4639688266152528,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.7786,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.4997818630302224,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8903,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.6224593209313116,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.8202,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4552829264775281,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.8474,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.5088192201038029,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.7795,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.49495732607506265,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.807,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.5428676966586734,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.9178,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.518535517881662,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7665,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.6454046836263292,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.9844,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.47639684324202214,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.8325,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.5120781918332273,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.8637,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5079393333553627,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8289,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.4542924014395214,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.8024,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.578089965059899,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8754,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.5061039518358071,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.8667,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5103291942577219,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7871,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.5008286477772609,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.8479,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.5921120659918616,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.8689,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.5371446072013113,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.9321,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.8025587927358921,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 1.092,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.5290978362007965,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.8358,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4750956587014473,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8325,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.5429519797737313,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.7595,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5443289856538186,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.862,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.477175604369523,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.7886,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.4567904137938678,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7842,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.5370099874742391,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.7789,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.6116928652479303,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7751,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.6534658855619526,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.9485,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.5267548915654874,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7644,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.546158562900475,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.8842,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.49734406840969075,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8491,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.503221592285957,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.7329,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.6020286777353191,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8598,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.6425221403272974,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.9093,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4992867409454385,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8603,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.4975241441241856,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.8822,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5445502485433601,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.9137,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.4500183589656974,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.7316,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.46361537222619187,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7427,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.5617104806372842,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.9108,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.5059886172049418,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.8165,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.45585084441524354,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.8853,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.6680443221863649,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.9626,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.5145275675086313,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.8972,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4111239629894829,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.6526,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.4511587296844328,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.7845,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.42424116750708446,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.765,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.4912814335522475,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.769,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.3983467674115582,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7489,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.5625653119972895,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.7935,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5552548657685277,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8309,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.4300047148160572,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.7869,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.5817896749539205,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.8336,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.7112047804693892,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.9238,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.5150828670492442,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.8975,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.4782176505007454,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.768,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4916380083078038,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7919,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.4919559785281054,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.7459,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.428628949999772,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7457,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.563634981100943,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.8152,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.5402069948290369,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8678,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.4399607273299524,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.7285,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.6242587539736574,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.9364,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.562721845396249,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.8331,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5214076502423438,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.8743,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.476645637993095,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.7904,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5193954849687584,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8551,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.5462745527959467,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.9005,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.42695216471586,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.726,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.5022640697248626,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7257,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4812952467049974,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.8171,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.42488386188997473,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7608,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.5808555818663869,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8605,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.5665458402725879,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.9286,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.48113860245205914,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7679,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.5777103511288991,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.9326,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.48625375995616793,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.8427,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.4386903291683638,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.7884,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.46327681828111467,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.8286,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.5601019486968891,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.8159,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.4301418905400167,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7878,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.5026133069983575,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.8828,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5299503679061817,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.8348,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.581319730296141,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.8024,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.5946632338435576,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.8225,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.4841597405281517,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.8143,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5492608521995358,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7956,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.45387668461942016,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.7372,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.5297681314438389,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.8176,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.5237874007801877,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.8855,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4203172607528882,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7114,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.5071416412021843,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.8688,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.46162030906175094,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7411,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.4894583703431277,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.801,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4711268262898016,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7989,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.6222170391946266,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.8928,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5537514579365937,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.8917,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.6053596812657615,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.9436,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5368770833791756,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.8546,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.6557971237633967,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.9259,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.416288866882269,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7353,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.5320282584159587,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.9659,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.541563183445848,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.8598,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.573944781458947,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.8327,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.48636608196368486,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7427,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.6118686890019372,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.8806,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.526158680155827,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8303,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.585820456566811,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.8888,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5641310017175182,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.8957,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.5515760176220771,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.8979,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.6231208105999786,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 1.0441,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.7365207038290411,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.9749,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4273855647133252,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7737,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.5059365066578263,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.8877,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.503614973058092,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7642,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.6730166765511943,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.9213,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.46396620660917376,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7674,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.5034901297109798,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.8469,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5123756459992598,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8319,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.5387604003324437,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.9073,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4725307584661988,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7775,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.4878781719567979,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.7366,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.455230826737341,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8115,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.5261328242485291,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.7685,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.5064275726334223,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8493,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.468520711844833,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.6597,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.44373890533763766,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7828,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.3758784537131547,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.7145,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.5414512159240334,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8886,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.46836934499681643,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.8519,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.5478216473576355,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.8393,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.6188204660715885,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.9017,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.520421586764439,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.8611,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.48349042295159567,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.8638,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.6761883802868082,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7786,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.4161033990297297,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.726,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5637465759314259,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8283,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.4763183478858337,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.7195,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.5708493331226704,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.8474,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.5242772582100799,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.8982,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4342083988934154,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.6962,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.4241316584309476,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.779,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.5512281159501403,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.9414,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.47281763704161467,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.7479,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.5253262322318374,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.9399,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.4713053988398959,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.751,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.48804668492107184,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7617,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.4894482088008986,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.8002,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.4960660530804454,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.8239,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.4336854030947481,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7727,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5365196664802592,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7533,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.46916503004233767,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.7548,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.7009035094639057,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7939,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.45369423347572424,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.7162,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5406665246930531,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8502,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.4098289899450824,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.7633,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5308195138806584,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.8122,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.5017791079272266,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.8365,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.6353701531931035,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.8842,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.5828084171271591,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.8396,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.49085125369595195,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.8298,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5753743296959409,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.854,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4753494413053485,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7404,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.5012155769884424,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.8808,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.4641152707086399,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7923,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.6036177751051387,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.8972,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.49537016981133947,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8381,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.47387744660741005,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.8017,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.5346572691831166,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7592,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.39518148845260165,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.7655,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5086735680516187,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.9133,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.5183446764477347,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.8268,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.5079181549084607,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7564,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.5723226508908508,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.8547,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.43762737232927784,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7613,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.5777902349863276,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.8169,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.46119697962249157,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.693,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.5679592046461474,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.8662,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.5143643118251245,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.784,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.5327345799873556,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.8125,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.4595260751518562,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7549,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.5045007852387343,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.8154,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4463937954372007,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.6961,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.552245733737881,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.8687,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.49848544400728884,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7747,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.5223353961366203,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.7559,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6075208329067229,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.8909,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.815476728266723,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.9289,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.643028271104798,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.8244,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.48280726217974457,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.828,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5439674438982885,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.8534,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.4687977100600042,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.7626,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.433287770246132,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7956,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.5126930198527264,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.832,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.6354646818331343,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.9188,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.6326475734716094,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.9524,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.41664299189410514,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7067,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.4740842085361335,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.6943,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.48662916484067625,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7801,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.4918517295772121,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7931,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.4631998368398158,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7924,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.6379201450826519,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.9894,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4242585975126765,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7472,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.5478588902325013,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.9123,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.5389180561217947,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.8135,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.585498259232321,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.9488,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5871634185845148,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.9312,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.562709572212109,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.8945,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4825365262187394,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.78,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.5558307139548955,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.8581,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.5099124822237073,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.8459,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.5651918845767404,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.8585,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.594763452897937,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.9477,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.5108837346519073,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.8448,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.6149527213658991,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.8743,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.5190735988917229,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.8117,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.6081118249705028,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.8562,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.4608503311720638,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.7288,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.6559202185899272,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.8916,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.47914269014723737,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.7825,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.569294625307409,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.8941,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.5115071926180038,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.8166,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.43673382782497544,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.8108,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.4193508222034899,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.6614,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.5444916476669717,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.8307,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.39709390155112134,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.7885,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4663528741990849,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7717,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.44767087061840327,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.7243,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.5400025740138015,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.9107,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.7242760382075248,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.928,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4699883135672619,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.8147,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.4507172457148455,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.7147,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.4477273850438377,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7828,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.573213467575292,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.9016,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.7395294482787395,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.8013,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.4278327164826983,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.722,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5481371702328152,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7552,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.4438702046348752,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.7923,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.49339323979189215,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6711,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.43469574787777077,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.7608,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.45469460771454245,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7548,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.4406509515165732,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.7219,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.49441488602329037,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7927,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.7379867681521697,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.9384,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4419917602257873,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.6832,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.5889559609480378,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.7841,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.40878678657385226,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7209,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.4902067343879289,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.7689,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.39543419743016395,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7314,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.5023813018744453,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.7474,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.49204425833848103,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.8135,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.5434543681211457,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.9608,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3584608607245962,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7393,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.5421303928890512,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.8541,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.5041842986860352,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7802,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.5686617552697284,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.7993,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.5174441794883738,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.8221,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.4343168867662223,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.7197,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.6260800137721261,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.8975,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.4406198345363788,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.7761,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.46177843371607763,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7759,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.5577682154881546,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.8044,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5629217703960968,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.9484,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.47427370070049796,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.8189,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.43541803965039383,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7736,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.5604636650499492,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.7894,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.40486175581997225,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7001,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.6116138036064188,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.968,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.47050393754892433,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7368,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.6197201726874729,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.8447,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.43127498601075004,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7601,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.5239926917161649,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.7749,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.5557068205548962,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7991,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.38524225379152355,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.6911,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.5867606517164021,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.8164,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.5524784254512559,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.8913,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.5090520257090125,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7865,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.4061404196162076,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.7403,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3887932548504206,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7083,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.4300486673673649,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.7345,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.44405217119720175,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7479,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.5407904878583915,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.8068,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.6441015602700922,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.883,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.48156565405133755,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.7746,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.5509487651952958,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7485,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.6022651603352311,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.9138,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.5698457697147494,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.824,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.37478841358951825,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.7013,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.5937745460335725,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.9156,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.4117617011434398,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.7609,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.452191062612939,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7114,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.5080631766111878,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.7616,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.46197159052348113,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7823,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.5913203255644318,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.8534,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4913816367545617,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.8438,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.4321676807811443,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.6898,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.43210841415093076,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7677,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.4399667333715279,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.7211,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.5513206722172173,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.8129,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.4525528282854611,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.7782,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.38156065752706614,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6655,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.7079666217225821,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.9053,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.584023598611178,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.8989,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.5537053421798345,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.8031,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5226360487155243,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.8439,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.4522843280300031,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.7835,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.48848173792589555,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.8099,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.45958022507217944,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.7018,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.41466007031314833,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7065,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.425364937392599,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.7512,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5096590210775217,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7954,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.48648410550376947,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.8412,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.623289539500742,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.8941,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.5677803983975525,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.841,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.34105874371506884,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.5849,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.6244999281906921,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.8543,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5141705732697385,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.8356,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.5386242018582555,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.7775,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.6612636754644633,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.9341,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.5066295764943997,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.7679,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.5929415431147723,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.8538,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.4823545097198918,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.8509,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.45011328394148964,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7047,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.4957158187778744,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.8761,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.5131984276569533,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.8019,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.6623000702243821,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.8546,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.372888034511951,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.719,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.48089725992376436,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.7641,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4869011180856098,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.8183,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.6683558921353259,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.8939,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.510736639991274,
+      "learning_rate": 0.0001,
+      "loss": 0.7993,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.5055250734081491,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.808,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.6225426629122933,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.9013,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.5637350549578577,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.7932,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.46993838974226765,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7825,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.5196216125995946,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.9727,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.5130227828649497,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.718,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.5286992190561343,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.7683,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6290709225831332,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.8726,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.40958606639063,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.6989,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.5992144491797667,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.8579,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.5408709821613941,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.7791,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.5603092739646592,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.887,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.4076298331910016,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.7167,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.48742935888812106,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7307,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.6328343040471255,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.9219,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.525172619120507,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7884,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.5533911039815641,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.7802,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.5248090780104834,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.9503,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.4546716340365706,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.6962,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.539918735292058,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.9195,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.461873164842901,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.7413,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.5404774421220055,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7735,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.48150403308113476,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.776,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4292803600015022,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7823,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.521600142890234,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.7734,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.46376561189326265,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.8263,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.509629506645239,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.8912,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4393346835557655,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7385,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.40822658272155166,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.7319,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.5102282881706691,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7441,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.500285468354489,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.8649,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5955534441368605,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7089,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.514751343769818,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.7919,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.6202458274255608,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.8531,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.5370911720219436,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.8021,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4596200244567425,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7639,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.4432096354968599,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.7892,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.6913138897781747,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.9536,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.5368223534102192,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.7887,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4470921354684901,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6637,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.5052879442478138,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.7381,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.6471533979972708,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.8575,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.46346575110280064,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.715,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.40234694118173886,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7428,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.5984049938457736,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.9005,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.5026897204768275,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7645,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.510315330137784,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.7703,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.43250426602522235,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7601,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.4267727818094612,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.7808,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.6966319221424616,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.9314,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.5901768920926003,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.8581,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.5357043052840119,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7986,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.48810903994513444,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.8064,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.4772440063206294,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7047,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.42291427247242586,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.7234,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5168772230534443,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7791,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.5716681716869837,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.8252,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.5754784563452197,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.8623,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.6061252279312543,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.8743,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.5159057110672841,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7723,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.6158818624972439,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.7882,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.6907244050654523,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.8958,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.5451679905966559,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.7427,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.5869039308703426,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7315,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.5163326015460028,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.8171,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.4555044357400919,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7902,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.5179276976344805,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.7577,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.5098795094050994,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7619,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.5744928900086157,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.8035,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.4339301950666902,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7728,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.48392687897486975,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.7994,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5577276559513615,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.9411,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.3813800760512216,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.7255,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.44710205088526395,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6141,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.5734428231196345,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.8319,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.471162491579229,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.707,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.5268331597214416,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.7831,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.5512066366460038,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.8724,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.45159331766129185,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.7226,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.5628669189861594,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.8246,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.4489717846993135,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.7273,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.5931957639193516,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 1.016,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.6878979745323104,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.8393,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.48279145262903267,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.7742,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.45784299054923344,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.7713,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.6114375823080654,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.8754,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.5131715129078588,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.7944,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.5131994906415137,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.797,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.5822445031872177,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.8634,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.43485343215141814,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7476,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.4613738032755144,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.7776,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.48501401501599023,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7265,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.37660273663796184,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.6849,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.5386722069111288,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.8196,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.4242962757171859,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.7498,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5741200059037986,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7797,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.5204073806313823,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.8835,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.5076601851050411,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.8621,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.5179380741278033,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.7959,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.5888857086694583,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.8564,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.5008378451925248,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.7997,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.44589256233006075,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7606,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.4116672820082607,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.6629,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.515604754170254,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6872,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.449656904421517,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.7218,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4423761588323261,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7004,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.43304351381495504,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.7544,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.388949384344476,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7087,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.45049895544234086,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.7291,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.7366164282084258,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.9146,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.46491414551503013,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.7666,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.42867949944378453,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7032,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.5147435436229524,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.8262,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.5367661938820013,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7697,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.5824157256969407,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.7554,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.38534545354199295,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6849,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.5284622775081942,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.8728,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.480831013120713,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7724,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.4294159155043536,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.6737,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5485800019065085,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6973,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.5378477773759867,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.8079,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.6023163155494816,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.8248,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.5365760352930691,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.7639,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4530385017080909,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7566,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.5552262914060541,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.7683,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5542391995409183,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.9047,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.49364800684924487,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.6982,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.474981388028254,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7669,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.4926565804261099,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.7552,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.6103439077637114,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.805,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.7709192308181996,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.915,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.6386787972460591,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7317,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.5065628984406876,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.7835,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.5436850226643325,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7782,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.5753151796667159,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.7177,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.5575765980076977,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.8319,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.47209779716940015,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.8046,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4895384765337746,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7902,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.454757882406579,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.7323,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5787640937084728,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.8592,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.43575556952847144,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.6981,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.5599270657727717,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.8224,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.3927164317907513,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.7449,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.5876453267453996,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7211,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.5284880195456529,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.7267,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4344724546277582,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7286,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.47331754558605854,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.7227,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.42289814294337735,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7804,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.4597744885394716,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.7347,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.6886867743546877,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.8761,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.5704387953894104,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.7524,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.7980508446517586,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.8903,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.6561748021138134,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.906,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.3997245264461394,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7133,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.49172147669387856,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.7171,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.47494727743860726,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6789,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.509205059753719,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.7476,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.44098130040514394,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.8466,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.4554178807918066,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.8006,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.5317635887247807,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.9353,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.37059723794483423,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.6507,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.4702183008942206,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.792,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.4551255320821976,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.7764,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.6339311550272567,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.8613,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.4761527444118468,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.6806,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4615038937896931,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7258,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.4606585651526168,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.7436,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4253710733831671,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6899,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.48024005565934813,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.718,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.5152988534627136,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.889,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.5324023326915394,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.7591,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.44548172271198205,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7246,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.453129656700259,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.6994,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.6138800607670666,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.8983,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.5924138427272913,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.7872,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.41836352920609127,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7172,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.5060582567527775,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.7465,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.5100761142075639,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7791,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.4748718110304019,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.7087,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4781754943952788,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7846,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.5116215246876256,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.9123,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.5653042070555684,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.8254,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.4099161045841414,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.77,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.5907690848546384,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.8821,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.38933980873615237,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.6871,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.445542876932296,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7841,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.6335858845221375,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.997,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4803421356568687,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7519,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.6064940252059536,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.8153,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.4128777697333255,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6295,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.4591857654670067,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.7009,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.5819461821891235,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7929,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.4556561799817687,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.7401,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.511193692242379,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7604,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.5125207401474582,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.7768,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.43482903797591776,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.73,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.40158455350863376,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.6212,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4794802313794276,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6973,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.6134753346845444,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.7147,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4247728899241009,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7204,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.4746055926663477,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.8316,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.4563585858381452,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7573,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.6514548402961644,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.8697,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4395184935238493,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7215,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.4252562174952073,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.7515,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4227675902507199,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.7169,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.4751010569172859,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.6626,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.44882287012285954,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7743,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.56196232380045,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.7914,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.5483852356476023,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7902,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.6439704028827931,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.899,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5663217598956615,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7919,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.49501494595816853,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.7635,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.5930865783379573,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.8109,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.5111538383222523,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.8551,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4724704337446228,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7325,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.4676027869294293,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.7118,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.5366310858073888,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.9235,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.45322326200589436,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.6605,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5001990346277503,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7978,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.40182637606093063,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6705,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.46906673716168085,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.7007,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.5362371092900002,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.7112,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5078116208393267,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.795,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.4556106823335202,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.6837,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.6318423018303347,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.8347,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.6495688410104123,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.8639,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.46876871290667643,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7366,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.4011594116666654,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.7196,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.49479019610079433,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6864,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.4815943792182797,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.775,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4579322955767885,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7325,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.4991315092613472,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.7522,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.48310010668510406,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.7525,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.5308408656927481,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.6725,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4659049002583655,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7696,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.4987725525086302,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.6749,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.4604305217685288,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6969,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.3745632467390563,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.687,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.4212724462363019,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7271,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.5905873954325088,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.7435,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.43581393035533844,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.8126,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.43467533201659003,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.6746,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4294076965433029,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7496,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.8322364025954283,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.898,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4999601809894697,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7144,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.49987777257007493,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.7348,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.5329345377491767,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7592,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.38317445403290057,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.6683,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.5080676931856752,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7267,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.47221572673796364,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.7896,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4694812272910821,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7261,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.4662327987492834,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.7037,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.5021252185446033,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.8143,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.5225033992837377,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.7233,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4105701621753054,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7242,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.5306223649945033,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.8186,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.4894162749572337,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.7235,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.46277641935711217,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.7005,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.6991867546461775,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 1.0024,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.49594568279315143,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.712,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.46694755255082615,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.782,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.5100406197855876,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.8262,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.5020707616239913,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7464,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.4353773917613402,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.7117,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4619046343314709,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.7834,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.43278550793305576,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.6557,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.48220896332287105,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7391,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.5493168603240508,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.6951,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.45094635972687336,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.816,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.4891259920727296,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.7715,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4375019371148813,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.7016,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.41909776259069126,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.7253,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4234543390903855,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.665,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.46238881696974216,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.7774,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5023095345955012,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7471,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.42649563909059657,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.6287,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4622928542748576,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7827,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.5917202929644055,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.7612,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4715229666077649,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7823,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.4382219635122664,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.734,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.5137357092717723,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7479,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.5566119421838533,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.8414,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.48515834899726845,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.7669,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.5399448894800771,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.7363,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4218218798399454,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6361,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.4702185424668213,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.8182,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.6263706758365083,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.8523,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.6424858002914683,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.8708,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.5013551193313562,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.8237,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.592697303217561,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.8038,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5095286786464307,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7275,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.41610246174986487,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.7124,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.525044257671557,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.681,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.5187728408748397,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.7496,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5954039487911745,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.7171,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.45051436916443294,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.7073,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.5648027876929638,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7821,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.47074310904855954,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.7567,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.36734367946250823,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6361,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.6807230093209986,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.9965,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4230860966116503,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.7158,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.4171111061257004,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.6374,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.48253610831141586,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.8264,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.4123366523890242,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.6718,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.49828993487937395,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7665,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.45664615319919927,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.7114,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.48335649902965766,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6716,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.4847550301151093,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.7474,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.5234526590176337,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.772,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.49879514783701684,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.7406,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.5033085718387407,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.7003,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.38764469155216236,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.6239,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.5703766646578441,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7723,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.4234542614190962,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6501,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.5008835886481774,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.8263,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.599495073220037,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.7097,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.5844611495743196,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.7835,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.43476950878045484,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.6799,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.48616257006951086,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7521,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.5436846517632036,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.7803,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.47154203209532536,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6548,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.45566972553858986,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.6729,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.7693258124254059,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7057,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.6801760699430367,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.7413,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.6048820714864578,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7702,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.48961143100612164,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.8139,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4121627801544965,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7368,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.44146407445745256,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.7054,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.49447203885631874,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.7304,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.599025188481658,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.8518,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.5079607966395224,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.7569,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.5186789106045554,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.7706,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.5603537997364025,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7536,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.5791561150960993,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.8087,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.43902036476656336,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7452,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.7575912669682205,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.8813,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.42945935489036363,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6217,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.44750100331178905,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.7209,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4409845770875295,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6433,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.46647686163124646,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.7116,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.5162949726365446,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.8006,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.5114649080326217,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.8247,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.49001671731893587,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7786,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.4223229927621443,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.6628,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4803671003840686,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.7072,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.5532220981211563,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.8015,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.43704204658882784,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.7021,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.5544186398509853,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.7479,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 1.0685852325949345,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.8271,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.4580950809239211,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.6717,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4178006662670166,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6687,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.5273656844850789,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.6818,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.43466594074759396,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.649,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.43858172586530747,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.6802,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4186937573931191,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6171,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.5520115829546185,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.8476,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4708825634329752,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.7885,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.48267036334656843,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.7612,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5077422497180833,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6632,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.539435837601806,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.833,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4419758101276182,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7182,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.46706717847608514,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.7399,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3942868612201381,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6905,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.823933864388514,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.7783,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4728779675632251,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7123,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.4374541321566614,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.5856,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4985701255460996,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6754,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.46546386224577047,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.7292,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.5099137877528729,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.7328,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.46783382282539937,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.6792,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.46221528019887015,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.7198,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.4657759004524181,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.7583,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.4755406411752139,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.7101,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.45146092263453164,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.7212,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.48181166430899425,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6708,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.5121672276622006,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.7762,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.5347895896829294,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7214,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.5062691297853303,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.7986,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.45470665182666015,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.7639,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.5633270465411959,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.7965,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4632187262935168,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6665,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.574279183736144,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.8285,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.44387512333929124,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.7701,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.5253025020458589,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.7622,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.35988195955396635,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6074,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.44751945613534244,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.7025,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.4525032309828442,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7061,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.6537877188354023,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.9126,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.4864319539069579,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.7534,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.4876601102451837,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.7088,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.40757454727223125,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.787,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.47892847598073485,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.7728,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.4757832511827876,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7495,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.5231076380426355,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.8446,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4584636561692689,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.7148,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.4249478616706151,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.6406,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.5974664972050698,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7458,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.43967215657372255,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.7047,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3775520915989605,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6716,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.5605163401425338,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.6772,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.5621911471363742,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.8153,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.42912955154901866,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.66,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.5202569987929434,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7066,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.4807687247104095,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.7121,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.4986644316693339,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.7457,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.47792773262845173,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.6795,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.46685496422213985,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.7332,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.48669661361282085,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.7764,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.4235981012865011,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6397,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.4904792258557603,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.8163,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.6050020644586003,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.7383,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.5121518171335521,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.762,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.5721883730616573,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.8488,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.5478659357304045,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.7503,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.43895362313133507,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.7001,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.5635448355820784,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.7955,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.49276395642104454,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7104,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.47743783074570756,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.7628,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.5468138659604102,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.738,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.59119847348091,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.7701,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.7471754912626668,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.7783,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.6290087709600349,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.8718,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.4751492256677272,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7373,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.5120311906692161,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.7467,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.5690162969284462,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.7774,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.655117479287365,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.8564,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5215038621038415,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6966,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.578325587613857,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.8556,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.4465886903437484,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7757,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.46492410138691087,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.7337,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.49953516173947843,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7506,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.44577082171877463,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.6879,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4685474938489625,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.7219,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.4658071939604654,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.6495,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.6385767458322889,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.8595,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.43942390825182814,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.6651,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.5034642463474058,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7372,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.5006545717086979,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.7398,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.44933894560864746,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.623,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.5840445867283482,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.8533,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.5706035386100896,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6921,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.46084713291319324,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.7625,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.5420705220649868,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7185,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.44589867765790675,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.6253,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4607921796004629,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6485,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 1.0893025371996412,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.7809,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4811219887426921,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.807,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.4840226488388807,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.6789,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.5135873716292094,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.853,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.36562653186171645,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.6781,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.42593289151012326,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6166,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.5315582194594366,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.6649,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.6127137326307142,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.7117,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.5303908062206815,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.8552,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.47838774189460087,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6406,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.3913538973837513,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.6891,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.5334598670020214,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6587,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.46587118676131073,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.6475,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4486855546434353,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6735,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.39660319163011865,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.5958,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.48039135395366067,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6764,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.39495013034193327,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.6095,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4717751676173904,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7248,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.5150801453634942,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.7773,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.6082428013992502,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.8776,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.4406367241712505,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.6803,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.46665206191644454,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.671,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.5460304496293431,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.691,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.47705943421423336,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.7156,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.6375269617497463,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.6572,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4630754577437863,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6607,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.4469535198120172,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.6361,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5132241096703618,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.8124,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.41536852207540703,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.6163,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.40779479029246263,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6948,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.45953098037180146,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.7308,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.47649432472023995,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.7045,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.4450325910963035,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.7013,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.5813744234646867,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.8248,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.5214211863700414,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.7779,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.48349638498519676,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6961,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.5056775048112797,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.6622,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.5647896557856095,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.756,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.4349903928065448,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.693,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.5798443024117006,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.802,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.4267548379955475,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.6816,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.45202988249577536,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.7188,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.44678846932757,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.6814,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.4836421448666474,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6817,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.5740021878837424,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.7962,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4784481331451972,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6667,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.4432807695617788,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.7471,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4177194996243855,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6628,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.4282931762857877,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.7124,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4262061012697565,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.717,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.4514805942685446,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.6876,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.5303649954638886,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7667,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.4918631618165322,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.6502,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.5700142202967081,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.8474,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.4991967240869772,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.7511,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.40217138492730187,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.682,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.48229462822619174,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.7206,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.47550104145425826,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.7343,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.45188720769842533,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.7025,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.7668311673684182,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.8834,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.5669737602421924,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.7298,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5137507586726185,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6404,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.5034102659190928,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.8691,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.514897507309948,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6907,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.38352291014196305,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.6594,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5516929282475472,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7701,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.5482142752963403,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.759,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.5054509558775777,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.801,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.4659084921053555,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.761,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.47021240701730616,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7001,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.49700801293005603,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.8184,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.7469452107817106,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.7729,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.4532722798183533,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.6464,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.40494768586241986,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.664,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.48079459836260613,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.7605,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.507345904652204,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.777,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.48410887979651696,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.7629,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5838788418864651,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7766,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.4743951487176639,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.8391,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.5560881695486783,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.8358,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.4701398204578433,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.729,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.6364855840237912,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7894,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.5991736751608101,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.8437,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.5354023370634079,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7529,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.6862494322593848,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.8039,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4778786680609419,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6964,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.49107623760081326,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.7291,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.4364131865010124,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6781,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.413089179648038,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6462,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.44454951240419877,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6908,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.5037902192703654,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.7678,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.3803449954705906,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6427,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.5063374888963099,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.7234,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4950748093427182,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.7362,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.4298150720578157,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.6113,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.4484073192872167,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.702,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.48076142652847925,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.6509,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.5815289608293004,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7593,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.47033738739691816,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.7879,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4095355712885498,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6214,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.5139860956680408,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.7745,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.5828420082000267,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.7234,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.45752920918524914,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.7287,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4996228279183593,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.722,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.4128316143700129,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.6079,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.6235720276309589,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.7944,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.5324177522890242,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.7151,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.4373834013581542,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6744,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.48830621150963666,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.7439,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5172978854452933,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7225,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.5055221153296886,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.7656,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.5802956841191637,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.8947,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.5737272556988163,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.76,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.43047610633564,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.7098,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.49059844294427335,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.7956,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.45196699568062104,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7491,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.460241222532305,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.7106,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3593871411948802,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6234,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.7025232576691512,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.8086,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.5658939556876554,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.7155,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.48882639524211474,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.7347,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4601932878135786,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7215,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.4529043738632631,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.6856,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.501067794848288,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.664,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.679684097281984,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.8777,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.49247108010902707,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.646,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.7252832657307932,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.8622,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.5300950176234022,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.8567,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.8460453494721898,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.6174,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4996024193406382,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.8069,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.4664523979748622,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.7181,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.47115617791424774,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6722,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.4420849651630586,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.719,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.48151919354073947,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.7406,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.4976385289428759,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.808,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.48447004710712116,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.7694,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.4476502108264484,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.7376,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.5955771747704717,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.9274,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.4079829047595218,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.7124,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.5028524878332806,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.728,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.4703890920466158,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.6841,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.48867952784400126,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.7001,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.5451033805868799,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.7471,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.6460001731465164,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7885,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.45891982107820856,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.6772,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4222374108765135,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6913,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.47249096967220855,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.7293,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.6296538495733958,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.7455,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.5041542749156558,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.6603,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4879198104782223,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.7456,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.533299422297126,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.6959,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.42419472398628383,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6748,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.5388702644620558,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.7351,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.45194480277494764,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.7659,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.5266213337050553,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.7255,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4521052692283909,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6672,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.4367416807703453,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.6763,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.5425671735354565,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.8357,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.39197805688782217,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.6252,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.636152187807064,
+      "learning_rate": 0.0,
+      "loss": 0.7529,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1013983240585216.0,
+      "train_loss": 0.8066569664001465,
+      "train_runtime": 18635.1825,
+      "train_samples_per_second": 1.073,
+      "train_steps_per_second": 0.067
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1013983240585216.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a94241f4cbc407ab7bd263a285354f3ced348326
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1d1a624e9e1e573417e2e92c0582dc8619a5d9bf
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:222d6020a814fe4a9d38c7187c52bcc5c8fab80c418dd58f37f19789d5d3258e
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6abfdc94db754ee2253717394fccd92e378a5293
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48d577f56281a55d9ebce8783da6a9f998306862aa87ce27b012361b7d4c3a07
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..40a4398bb2113ddb8bdc3b4d774db352dfc0b074
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.8986589092020809,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.305,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.830293875624169,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.0948,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 1.0859176979111835,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.4743,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8491307507912119,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2398,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.821827783074679,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.3072,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.824226467468147,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.2926,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.7462316432208581,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.1826,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9694515614371708,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.329,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 1.355360401054164,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.2415,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7877105211597762,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.0667,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 1.1658824689297784,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.2091,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.9327083427649957,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.144,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.8541289719054126,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 1.0759,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.7901468926109594,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1089,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.9870845593100134,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 1.1508,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.8366002733618001,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9787,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.5647308493323178,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.8868,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5265349634968662,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9564,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.6688334553966441,
+      "learning_rate": 0.0001,
+      "loss": 0.9151,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8331128891603933,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.0021,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.6671520676918071,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 1.0317,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5400337392126402,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9324,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5516519660957107,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.8616,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6051849477286368,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9653,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.5220444727960437,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.8718,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.6123955645188963,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9204,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.739824439009589,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.9997,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 1.0035094814749785,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 1.102,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.49200935834599646,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.8157,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5254439825147661,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8688,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.7579956292478108,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.9638,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5490449797222368,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9614,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.7159607383012427,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 1.1859,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5031259092114513,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8207,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.6530032773796935,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.9945,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5311755174792404,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8765,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.5265268927327297,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8142,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.6594978793784297,
+      "learning_rate": 0.0002,
+      "loss": 0.9188,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.6108156444196295,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.9996,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5207524633266147,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8505,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.5945794453910999,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.8842,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.6835069109971997,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 1.0261,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.5822115055269951,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.8708,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5409224848399561,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8429,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.6552700099296889,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 1.0053,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.6259818352901533,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 1.0065,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.5816629608848023,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.9192,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5455571314604929,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.9025,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.5678393942881697,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 1.0135,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.583023033404638,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9791,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.5607331923054734,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.8702,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5631397290394965,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9171,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.4958191335673494,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.7935,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.6636439021012339,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8472,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.6049671251396802,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.9366,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6812943564570575,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 1.0725,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.5778111405788794,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.8981,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5423108759849826,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8489,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.6029690474490851,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.92,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.527992273229674,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.7874,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.5350764937310879,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.9199,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5796700937405846,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8645,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.5215364379687121,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.8547,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.6062129459008473,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8867,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.5605748343295668,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.8111,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.576459556463628,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8717,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.5324349668538816,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.8282,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5138522744442616,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8646,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.5010041056712672,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.812,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.7886740692553563,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 1.0595,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.5457095602344805,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.8222,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.49123538118979226,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.7757,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.5115269662398529,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.9046,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.6578445782195319,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.9942,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5954139274878782,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.903,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.618133330783266,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.9027,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.5209779171701726,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.8011,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.5721584497274415,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8403,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.45441592205501147,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.7716,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5903525833772922,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8783,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.5303726745446902,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.8185,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.772029898870653,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8899,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.641589294357296,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.9059,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.6217613426147984,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.9168,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.447177478722229,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.809,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.552806211756663,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.9089,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.464126069407049,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.8101,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6040238156148237,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.9611,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.6338051158624403,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.9381,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5623887909587395,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8903,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.5592043890991619,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.9391,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4635024578463063,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8363,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.5898751617245477,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.9501,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.5054290458812549,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8782,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.6134646974943796,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.9037,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5717677399296033,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9824,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.6370817907582318,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.804,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.6284576519885868,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.9213,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.5832943669530816,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.9025,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6280556178077573,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8009,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.6784617300670089,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.9593,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.5699631314098912,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.9014,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.5005466864303845,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.8087,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5819193260487425,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.9133,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.7866528561879383,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 1.0283,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5642528780685404,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8159,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.6208922393037105,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.9366,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.675109352046293,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.9294,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.641474898133526,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.925,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.8968056261287591,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.9321,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.6247995474162983,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.9779,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5600887864975691,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.9847,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.521242776088519,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.8781,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.6125366480896934,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8999,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.6533464104827684,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.864,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.552235681308966,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.9333,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.4431325972855391,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.8051,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.5453940866305156,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.9783,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.48508766719121615,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.7914,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5031532876965941,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8062,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.580797968287359,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.8684,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.49221239109286524,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.9073,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.5681710105726208,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.8279,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5221819706798567,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8291,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5050274477718419,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.8889,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.7052424313505582,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.9742,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.5825164051491847,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.9465,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.7561470692223455,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 1.0117,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.5772282257061512,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.7795,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5297039548484948,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8911,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.6347690410009654,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 1.0346,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.7027346281966333,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.9865,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.6417895499509113,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.9555,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5397045126751981,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.9009,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.8677659845405544,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.9189,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.7541996003157508,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 1.0182,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.4763386277168582,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.7819,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.601268655373242,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.9141,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.5119877362673206,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.8581,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5741522809403841,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8725,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.5046784102658995,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.8491,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.7807045825377874,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8545,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.5587935402064526,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.9317,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5252305623416647,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.9038,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.7861363392166195,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.9387,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.5612971617745024,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8797,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.7073398481937953,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 1.0086,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.49866728031534496,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7951,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.6099236862660113,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.9616,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.6027224592471235,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8058,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.5781441720245145,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.9271,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5813183201401009,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8407,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.4872919673337898,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.8195,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.6324964991159663,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8517,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.588729212998381,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.878,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.6264412787638441,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8654,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.57769459491213,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.8709,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.6357337602139279,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.9951,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.9163531621840607,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 1.0036,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.541890790204263,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8461,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.6106261496261902,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.9491,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.45323037858604226,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8413,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.5103765969188921,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.8609,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5510921340396624,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8158,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.6175356227388614,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 1.0139,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.6370307555105761,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8622,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.6811171081420755,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.9372,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5598562796566181,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8956,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.4617016786400735,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.8088,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.6102848905343905,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.882,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.6962296432424184,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.9596,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5750923928215433,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8366,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.6265326368878519,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.7805,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5381935667615324,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.847,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.47642701981179963,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.7575,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5185766673031289,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7912,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.5960095358549333,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.9236,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.7084179815147453,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 1.0528,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.5365346049625459,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.8705,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.44830967852358555,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8658,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.5799425229989558,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.8809,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.5266534061420898,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8845,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.49940366689432303,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.8514,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5502261680204071,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.9158,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.6852350642167058,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.9311,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.460573911082788,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7992,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.5827491189696308,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.8715,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.47097245497372964,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8452,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.6304041203761992,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.953,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5043229786824451,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8501,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.5534866676999771,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.8797,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.6283272982521915,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8713,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.48771113456670234,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.8375,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.6286083414158488,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.9169,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.5115509368730083,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.9247,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.47098215222574896,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8131,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.5556862923247421,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.8602,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4841783174408543,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8302,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.6708171063402733,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.9678,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4849407325306258,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8129,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.47576669125357923,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.8517,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4855825169345397,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8717,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.557922956246443,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.9238,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4979915047932215,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8587,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.8209533560536886,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 1.1452,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.6042494889849994,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.9018,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.5365944913148071,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.9367,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4880610297558772,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.9145,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.5096484059564002,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.8918,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.540369934145095,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8379,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.5280688976883253,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.8826,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.5988073761605279,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8368,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.47933975417162,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.8315,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.5192695982409684,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.9,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.5312935651865488,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.7853,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5376996711618321,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.9059,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.6103998381142328,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 1.0594,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.6018626550476088,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.9539,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.49601363247290925,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.893,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.7674521964202011,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.9014,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.5336013967413517,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.7981,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.7835491538209578,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8743,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.6692990893335212,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.797,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5554694978076148,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.9045,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.7018575579850007,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.9661,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5732712171563312,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8338,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.6257272102104295,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.9729,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5594595787785903,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8213,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.6979267188451503,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.9913,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5994165349249353,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.9171,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.4716135434126101,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.6811,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.7599573528458343,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.9036,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.5150015644946726,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.7706,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.5200450168942828,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7307,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.47433937688998185,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.79,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.8329159675900423,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 1.0186,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.575539326963917,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.9364,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4530633216315929,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7354,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.5869814040574673,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.8727,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5449803815964556,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8626,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.4894517147542963,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.8402,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.6292559711803009,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.9694,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.6147704342501734,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.8603,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5431473970139671,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8142,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.5496557815725536,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.7979,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5072773852005839,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8802,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.6101453917715832,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.913,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.45322241352839027,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8332,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.554767308131564,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.7633,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5912783933770612,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.867,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.5891468598140162,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.9687,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4997613391149276,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7937,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.5028289046293992,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.7867,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5815838698745088,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8327,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.49985488612870416,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.7991,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.6216224645165545,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 1.0184,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.6116470706461582,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 1.0301,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.5531047775225604,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8914,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.5079295779498472,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.9567,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4454675416660914,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7465,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.5493225471484111,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.9337,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5991957073365373,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.9526,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.5348809803871967,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.7977,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5186383297042673,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8872,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.5960123170128296,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.9598,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.6515494979551818,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.8145,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.6080928936698701,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.9205,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.5504865158312073,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.8171,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.7062526284859021,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.9854,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4843625073655576,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.8036,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.521267876354061,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.7845,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.5890747746964897,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.8138,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.5239600933357469,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.8446,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.7687056630992152,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 1.1336,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5885758299804096,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.8736,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4545727094517842,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8202,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.4853582138003786,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.8451,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.6297161632990195,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.9326,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.6520961746658375,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.8899,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5798076905396926,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.9056,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.6156072852450034,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.8909,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.5732894197464338,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.9058,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.5727194681440995,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.9331,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.6256855912463656,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.9958,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.5141184537751282,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.8346,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.5626344634126693,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8372,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.44884469985131437,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.8172,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5857688983623034,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8494,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.5818393340233482,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.9132,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5189677144113589,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7677,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.5373850787765176,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.9232,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.5336004325307646,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.955,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.4479741697852256,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.7711,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5892862205502235,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.9122,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.7034529147174017,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.9757,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4920748092757167,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.8567,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.6935746977422499,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.9767,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.6857207812627873,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.9712,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.5575033476384434,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.8163,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5723045438689641,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.9316,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.47386175233905387,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.8197,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.5586605608011292,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.8275,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.6728938504797903,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.9101,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.49103317186866774,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8359,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.49217192479223953,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.7596,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.5218476405354987,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7309,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.608874260267828,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.8727,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5522196870339003,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.9198,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.7229557914044292,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.979,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.5348698413188704,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8814,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.5622567164554441,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.8156,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5093756906036769,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8298,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.46793398379285495,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.7569,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.4688320329106946,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7834,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.5708290914468318,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.8218,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4704229420649958,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.8628,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.4573714211278024,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.8404,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.5018743555661164,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.8737,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.524828796964741,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.8281,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.579486821084244,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.85,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.5740604251572758,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.8501,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4379389482626325,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7899,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.5143736514553539,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.8614,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4787207925108045,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8104,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.4652530210510522,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.7544,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.439584122847283,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7797,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.5785898710901757,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.8617,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5178898755449122,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7682,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.4952121878341345,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.7104,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.5343104495072938,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7876,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.4808948891989481,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.7816,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.6958866218686733,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.9808,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.5846119691054286,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.8052,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.46570870144186904,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8445,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.7693973221460947,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.9614,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4704975865295557,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8044,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.4206300887289306,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.7798,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.46078013380689686,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8296,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.4442810770381299,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.7575,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.7175217922445156,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.9168,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.6585571860135052,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.9008,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.5223436809700848,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7575,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.46856118841401995,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.8129,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.43582582860860897,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7806,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.534652749247743,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.7925,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.5343850961920595,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.9008,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.5018036235407817,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.8923,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5433844630930597,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8905,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.45363314802800025,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.7348,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5800104410459153,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.897,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.3607190442404761,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.6753,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5796044387108275,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.79,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.5288070170976834,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.8346,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4642039164416691,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.8436,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.47729002802695314,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.8802,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.6752672490776792,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 1.0189,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.5408019204341415,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.8333,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.46012779574926227,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.77,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.44564761511274714,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.7955,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4397975238603909,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7309,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.4601391505956761,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.7598,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.43078743526528973,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7514,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.6943844358517226,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.9688,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.6199289848077439,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7856,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.46086670594540174,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.8119,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.4933704290625319,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.8066,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.6226839151315376,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.9057,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.47510078911012704,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7527,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.5487304743269249,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.9311,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5243769562453834,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7744,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.6060607560363251,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.9562,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4447612102652895,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.799,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.5043201921155803,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.777,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.6474988233382649,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.9045,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.4340313396759206,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.7521,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.5615075768410082,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.9001,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.5607387146117787,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.8576,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4731339230325628,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.768,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.49559062627473566,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.8063,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4455448205556065,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.803,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.5188763316404755,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.8693,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4788968930738808,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.8193,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.565510388051632,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7964,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.456549323203182,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.798,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.43220854449176305,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7546,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.5894674924180361,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8805,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.4867511866315462,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.8065,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.45750857416095064,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.8482,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.51811947034319,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.852,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.542414398039288,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7855,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.4177555443021668,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.7254,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.5122162363581776,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7879,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.521585064075366,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.8425,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.5337676365319911,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8443,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.5532009911621519,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.8758,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5728067440563523,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7819,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.5210103844247099,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.8325,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.5087466797181295,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.8146,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.41931715800933833,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.7942,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4901933553582932,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.865,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.4765670951576811,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.8184,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.46904520645493447,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.8152,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.4959381631072489,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.8772,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.43533764493862964,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.8032,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.5203866447099498,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.8453,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.49538698755881333,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7219,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.5626546685841983,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.763,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.5463990462052876,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8225,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.6738808055422155,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.9124,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5566517665237946,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.857,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.5818652259183016,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.8892,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5926880656892722,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.9134,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.6825984184560483,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.9489,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.44950861810545384,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7511,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.5198359021561324,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.8457,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.5696549519021159,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7875,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.546694860620974,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.8445,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5380282710970803,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.8209,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.5937082015021489,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.89,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5287257831042314,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8345,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.6103183473899669,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.8317,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5094586387221397,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7889,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.6107261454152543,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.9488,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.6065117264413042,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.9215,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.7053169198061089,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 1.0075,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4228898589354549,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.8079,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.487172024547341,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.8748,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.47144041066438463,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.732,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.6818675806798845,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.9766,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.48689971024583084,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.8083,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.47651144680168256,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.8149,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5814637437027524,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8853,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.4746640362343065,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.8162,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.40043688621775886,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7352,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.4833429780552123,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.7851,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4239299991049108,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7385,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.5172926681098085,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.8414,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.5120086530270151,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8446,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.5669471654453373,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.7746,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5054731462512156,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7986,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.4352437564325993,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.7217,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.5675270179816488,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8416,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.4750355240151204,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.8545,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.5129569536686879,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.754,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.6952488173803737,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.8875,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.460637684999673,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7656,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.4930923924746972,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.8337,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5601297336887241,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7948,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.4207991716017091,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.7117,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5585803045272063,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7722,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.4701896450197255,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.8381,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.7171183686662301,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.8059,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.5987006577708592,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.9456,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.46718902609560226,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7905,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.4259181214410821,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.7584,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.6194362987661196,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.94,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.4595245370288295,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.8245,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.5224041089925482,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8764,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.5138752347477095,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.8138,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.49270360152003395,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.82,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.5344931409250059,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.873,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.44632385553108306,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.8228,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.444515202856901,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7699,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.520110469642323,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7739,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.5023472914845316,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.7977,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.597348091144304,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.8887,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.4302257534061434,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.7073,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5753588957288104,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7243,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.4255194197110321,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.7516,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.49607289838443813,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.8165,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.4420078578976846,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.7923,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.6061627696050104,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.8236,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.5509076824112138,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.8348,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.4781353200177167,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.8636,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.4993768599320977,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.8997,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4531237317735709,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6856,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.470354206776997,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.9099,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.5095269991795404,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.8303,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.5883162584844065,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.9061,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.439843653168383,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7372,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.43366569264201593,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.7139,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.521527189637558,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7783,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.3994649107251338,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.7339,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5860923480126341,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.9247,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.49501107106674097,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.7976,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.5062371266281463,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7824,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.4713455662305919,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.7888,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.46974333470995056,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.817,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.5424012949016316,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.7656,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.445227479307242,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7223,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.5655662279610364,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.8941,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.5181512905899659,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7798,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.4944307879552544,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.8009,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.4431718654723363,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.6793,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.47978384089329723,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.8717,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.41275828014932414,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7453,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.40167635056969075,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.7552,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.4768311161085942,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7477,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.5969868558835713,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.8123,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5509063771489698,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.8173,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.7314088969666789,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.8875,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.6703491668087395,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.743,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.4617839957159876,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.7303,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5592130260304664,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.8657,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.5487572328437771,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.8064,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.5293588109138585,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8289,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.44982002722735015,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.7077,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.7512862270318826,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 1.1271,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.5766551507517041,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.842,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.47528773107321093,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7522,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.5058145440794231,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.8256,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4909749501965408,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.8331,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.5194354145101123,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7576,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.46875827617231014,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.8161,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.5693573690198975,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.9164,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.46935357691176377,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7485,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.5948283196167345,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.902,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.4766212239023237,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7893,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.607765090026707,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.9624,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5326289521595492,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.8394,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.6048705341164726,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.8717,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4825858005691562,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.79,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.5709289382929376,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.9274,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4588906625153007,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.8087,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.5509718129764554,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.8854,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.7697820546447985,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.8984,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.5002318272586276,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.8744,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.5419619630601956,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.8159,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.47735292588209827,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.8128,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5990287239162032,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.8258,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.49600692051562495,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.8125,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.6293896630873944,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.9108,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.4947857180649281,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.8444,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.5639879587324572,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.8394,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.5397476844949073,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.8269,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4232618190448643,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7834,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.49956002834638136,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.7525,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.6187204256102857,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.8394,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.4215636842122824,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.768,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.45879407465937844,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.8202,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.43491023553817093,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.682,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.5457799956740407,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.8866,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.5041001973150158,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.8291,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4610663851280987,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7835,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.49938601271838723,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.7862,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.4424713587817567,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.79,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.5091447721592515,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.7629,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.7851193524554307,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.9942,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.4627528445492815,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.8194,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5108947689777327,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.8145,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.4296843232450707,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.7779,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.5198850742422576,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7927,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.44985237069445677,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.7671,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.44158884644247104,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6889,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.4265756896029439,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.739,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.5390316290711439,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.8385,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.73606467368334,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.9169,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.5547510346684561,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.8029,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.6157708117641697,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.8162,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4592158639221032,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.8314,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.5301100515025033,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.834,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.48034266500540934,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.8387,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.5807002869606578,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.806,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.48886470315743324,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.8138,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.6142185112064632,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.8387,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.4425976264319214,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.8137,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.46162394263837786,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.7112,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.5274038551293228,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.8424,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.5630999009119136,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.8762,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.5849020792876066,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7851,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.4198420126887951,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.7423,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5912880602623347,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7853,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.39988609255560875,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.7304,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.40463911888659115,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7231,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.4796130771098071,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.8143,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.6433056888849106,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 1.0016,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.4127734266327962,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.7422,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.44914437089719506,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7655,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.6011966541551302,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.8456,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3849250338300349,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.6709,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.5141797468340683,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.8416,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.445264622635813,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7007,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.5825263970937105,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.8128,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4971690811475642,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7037,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.4421032507562734,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.7006,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4505115695438612,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7256,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.5301309168789724,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.8689,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.6159699802495321,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.9241,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.48592950799355167,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.7434,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.47418559732631554,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7166,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.41718431814920304,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.7668,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4155365826767906,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7643,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.4400801636486908,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.7299,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.44501033272795804,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.6985,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.5296054918048602,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.7756,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.7416615624398768,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.9492,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.5162215630640894,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.8065,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.5324339114918692,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.769,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.6489662996797839,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.8399,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.5316210959628527,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.8054,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.5450723157158529,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.7204,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.6653921394791508,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.952,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.42758237423704815,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.7746,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.37970567907221614,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6783,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.5306101545577548,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.7251,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.4879322277705361,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7993,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.5860036003574343,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.8047,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5409230231224406,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.914,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.5180729089295505,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.712,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.4638419079807667,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7917,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.498367616853681,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.7935,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.5020078746326063,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.8474,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.49792991575536666,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.7891,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.42927436111692624,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7354,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.6189941094998067,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.9778,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.535158019745401,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.8887,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.5421336286292466,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.726,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.4774234587003961,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.8258,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.4393486984262681,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.8249,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.5168133456503567,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7769,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.4805966027030343,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.7747,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.36572368436725333,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.712,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.4446834682288979,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.7308,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5556975393599263,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7701,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.4251983124162712,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.7161,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.6331971990969477,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.88,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.5519027354901856,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.7777,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.423190616566324,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.6877,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.5503702795446749,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.8817,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5388373266987113,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.8474,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.6012561415901527,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.8188,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.6950778812475392,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.9335,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.5187122891657171,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.801,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.5667565102338985,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.8518,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.44921171437003526,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.727,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.44041090396110116,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7409,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.4635285462260514,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.8113,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.4351640872935251,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.684,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.7346651886825838,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.8041,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.43179688684376,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7608,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.44408111726364724,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.6759,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.48160792171232253,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7908,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.6539498144594257,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.8314,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.44770276471829223,
+      "learning_rate": 0.0001,
+      "loss": 0.7071,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.5152156674652172,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.8245,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.624011380930772,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7836,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.56835724540912,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.7877,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4897329961960285,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.8135,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.49578090759811055,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.8754,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.48731142631830265,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7789,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.5673459149137637,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.8433,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6188075664323361,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.8938,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.46873132471454837,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.8145,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.586928798556376,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.8484,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.5255526240068161,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.7714,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.5059476711446602,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.8066,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.49582010272907506,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.7973,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4372289417856676,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.6713,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.5851419845977731,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.8903,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.6412073563576178,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.8629,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.5336461120146851,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.7686,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.5473242230872678,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.8852,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.4835820367008371,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.7318,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5887152325895428,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.888,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.4813882793581241,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.8032,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.5416113889145342,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7934,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.4783917145856643,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.7595,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.41668235251938335,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7914,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.505696911444653,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.722,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.42344148579580215,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7278,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.5015991421348351,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.869,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4885835248231759,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.8106,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.4210617527552405,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.7295,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.4563666120496189,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7539,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.6281822494421835,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.8547,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5227123231829046,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7135,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.5436077236107482,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.8205,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.6576452598843193,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7732,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.5460176922167431,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.8402,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5169191615165644,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.715,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.4735339701945484,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.797,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.583929608657358,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.8244,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.5857100895089427,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.8555,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4287227609471326,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7004,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.5591560903597864,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.8802,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.5709463423227509,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.9058,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.47814781051246497,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.7459,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3991936136993612,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6433,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.6474370255263097,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.9395,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.47354372366850817,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7087,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.5204739940092096,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.7485,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4024037903604234,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6583,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.4328957613797475,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.7733,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.5988603496036152,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.8826,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.5959098740282872,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.7896,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.5481226538243807,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7953,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.4418655243673763,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.7368,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.5465076665795701,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7109,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.43624854053643497,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.7559,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5818695364312464,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.857,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.5649071509926393,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.7932,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.5352732305024815,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.801,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.5599173239277646,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.8714,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4900154560625499,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7859,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.5502231586252436,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.8628,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.6632543823507182,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.8263,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.5520641901799846,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.8177,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.513119056715211,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.773,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.5116341203636527,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.7843,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.4503758921300252,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7573,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.4900627207577956,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.7897,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4703910310994137,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.8865,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.5531414201362082,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.8001,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.4457053862521107,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7449,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.45835200222634626,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.7697,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5391822751157387,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.9577,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.41121558299077177,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.698,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.5643814352461478,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.735,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.5568828460124492,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.9282,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5539915368104986,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.8599,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.45352224561414184,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.7158,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.517710231897813,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.895,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.4794346129983727,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.6596,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.5452911706754874,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7885,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.43227433392495757,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.744,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.5783187900232767,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.9126,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.6757420671981376,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.8696,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5289380766238764,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.8126,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.45917046745303486,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.7065,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.5996754384273839,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.8266,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.4390117214376881,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.6859,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.5049988200186094,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.8494,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.5542254634032855,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.7942,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.471479053679602,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7559,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.5412297686164421,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.8575,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.47960063289980687,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7282,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.41780012519923804,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.7839,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.5202860281173485,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.8104,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.769126208089339,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.8103,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5996068989135162,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.8862,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.5094074554830536,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.7985,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.5087296490744276,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.8255,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.587661220344711,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.8471,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.5185898319881128,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7476,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.544469500243794,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.8783,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.5254123005813613,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.8087,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.4406116146332978,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.7286,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.5028576842819465,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.733,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.41551135895499197,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.6644,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4696895218060643,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7187,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.41672726956612643,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.7584,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.41873577517602656,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7304,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.4696259863617647,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.8508,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.6143166042160803,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.9485,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.4613086902582385,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.7219,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.39347765856491007,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6624,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.725989550678472,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.7544,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.5415172592089366,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7987,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.5502217968259229,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.7951,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3424383843607733,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6563,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.5428890927439577,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.8372,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.4765435615003908,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7171,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.44134979864312174,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.6679,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.6268717622111956,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.9558,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.5844528780224308,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.7697,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.5941143028746608,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7959,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.46393220370096555,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.7649,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4765025939352185,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7156,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.542382071248004,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.7514,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5272328764131159,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.8734,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.49204765303119435,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.7192,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.5490594184428006,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.8106,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.4904484145501484,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.7916,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.67000629408774,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7784,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.6480219038042772,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.7911,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.6361964397959794,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.8337,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.5253640742734401,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.754,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.5162740009828527,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7588,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.5134935237825705,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.7404,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.5766140612672064,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.9184,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.4663832143024057,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.762,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.48911355189488026,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7633,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.6694129803671309,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.8236,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5189762766415502,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7676,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.4161464231880787,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.6936,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4984293204783021,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.8314,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.39903904223045994,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.7616,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.43805121875966846,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6689,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.4895082281505452,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.7549,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.46576964895451856,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7616,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.4899997620883344,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.7636,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4278249266877664,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6874,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.4422267158793512,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.7535,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.592090602720976,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7782,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.6157235118768961,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.8226,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.6727932260245815,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.8099,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.5703922364388165,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.7134,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4149199530047408,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7523,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.42823391157851937,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.6743,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.9501696638008198,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7228,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.46931384274994215,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.7168,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.44711369177258536,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.8109,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.44288117617273876,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.7881,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.6499248586366762,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.8619,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.41703322618341265,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.7083,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.5095449365168471,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.756,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.41427632543743154,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.7486,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.6832171979501523,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.9427,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.44443690827639093,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.7332,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.48159945658139397,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7668,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.4923218317336922,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.7557,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3999444938598147,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6873,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.5202580334267757,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.7289,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.526995723483989,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.8378,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.5259009754185895,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.8784,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4148487564467498,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7354,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.3892718151581878,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.65,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.5008382317275297,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7473,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.5651862447253267,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.8362,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.46058756838487114,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6934,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.4993820774612727,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.7971,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.5434804356503512,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.798,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.43500365960349463,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.6257,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.49070277501854065,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.8306,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.5631963321595915,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.8129,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.5609443791431765,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7962,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.3791295673912737,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.7109,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.5859099014370488,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.9052,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.37561326410012924,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.6309,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.5266492046904344,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7868,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.5867416216649431,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.8795,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4240560451003615,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6787,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.6062473435601824,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.8306,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.4980079779028096,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7298,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.4661762480561208,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.7258,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.5948880473411658,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7926,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.48975084576834266,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.7845,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.5167217184044314,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7723,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.5239428466594733,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.7706,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5806756500414043,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.8032,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.4596292999499451,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.7199,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.5596227848306364,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.8312,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.5591963136625346,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.8315,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4449170323794898,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6475,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.47705498742937347,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.8322,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.4786649964050982,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7813,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.7301904758939219,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.8948,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.43763461326643843,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6743,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.4314007227657756,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.7339,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4134202505028603,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6809,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.5127035873811302,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.8072,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5157247141627943,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6914,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.5538438779175323,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.853,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.5693373524521677,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.82,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.613013512189452,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.8581,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5782273687721763,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7864,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.4832023140562939,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.7596,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.5893373654741814,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.8659,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.5728356489192235,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.7982,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4963888013123301,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7811,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.4593544644255751,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.7239,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.5292808478828301,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.777,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.47716264961011856,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.7799,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.48102047219338884,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7952,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.5930481959978581,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.7012,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.4680186213844873,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.7274,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.481882307486294,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.6557,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5636609433316219,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7124,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.42374086491731305,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.6853,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.5122385038575532,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7265,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.5590270489108338,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.7621,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4817673474442853,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.748,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.44675247122412026,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.7411,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.42361063787076775,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6925,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.41144769791298424,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.6548,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.47171016221244494,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.703,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.6101698325485143,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.8228,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.5041520888592116,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.7677,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.7814869692964698,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.683,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.46844930916310995,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7473,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.5959483088682741,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.6458,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.5063605399129267,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7905,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.3658666952668689,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.6757,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.4564275633578927,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7749,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.5076235699518854,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.7292,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.46941124958967184,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.8078,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.42706239756817554,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.7113,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.6161008203576525,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.787,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.8534797844851879,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.8427,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.5520642001035455,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6774,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.4996239606791152,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.7721,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.4880290230071659,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7185,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.37064325461692554,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.7046,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.5034607956958921,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7567,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.5465610376232676,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.7808,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4809947903038082,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7492,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.44723047852010356,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.6718,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.4658776704784212,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7193,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.5914359748097043,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.8102,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.489289904908363,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7612,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.576337648589474,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.8564,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.5813448675355454,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.8075,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.5253527903268184,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.7886,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.6103612355242102,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.9397,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.4577395100753244,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.7448,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4825704700026192,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.8243,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.5219110921451579,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.7542,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.7871936069186822,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.8077,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.47315481292649386,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.746,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4748880749384918,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.8369,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.4080798085611525,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.6531,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.5119253583646589,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7468,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.6212700792170417,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.762,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.5282916644005026,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.8204,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.5071041999449878,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.8328,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4543491551374726,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.676,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.4303808812872645,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.6649,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4145040254576927,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6757,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.46752384609791203,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.8547,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5248016141240613,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.8223,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.40006990522502706,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.6361,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.41418076269554743,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7049,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.4584166617163971,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.7265,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5444286078527143,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7774,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.45218769533175474,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.8481,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.4864396689150151,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.8067,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.44434463233048926,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.6853,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.41700659337429324,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6749,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.5993802512462391,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.7137,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.44931539515794827,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6867,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.45909105181412047,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.759,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.681985951504881,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.8368,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.8366986354033052,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.8576,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.4921233294818476,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7727,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.4771865298525171,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.7724,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.527380279188646,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7268,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.3751049915807199,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.6258,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.5285171071742202,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6757,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.5029451702331986,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.7242,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5881607167547737,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.7358,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.44888944202160613,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.7542,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.5517416672102848,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7485,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.5063472555628153,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.6773,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4078174656869059,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.793,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.6174211087839312,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.8971,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4797403859123356,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.7631,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.41077179789727625,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.7006,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4511949612040704,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.711,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.42043055532718043,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.6621,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.5378329077146456,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.9106,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.44452033249967243,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.6982,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.5084625352483807,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.7144,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.46768349941192616,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.7016,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.47694367327979464,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.7479,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.4765841625519169,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.7587,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.5965387934471909,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.7642,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.456663109129941,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.6274,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.5917085485189117,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7109,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.4264667225940256,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.682,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.49031682812728306,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.8333,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.5445763973198763,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.588,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.5167494042052067,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.7588,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.4109214999293328,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.6259,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.46313638523489997,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7309,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.5405431360774156,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.8372,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.46212905775295887,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6678,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.481123042266737,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.6835,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4226882956652361,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6223,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.49527409119365456,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.7526,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5038293645904581,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.8102,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.46762425327328927,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.6715,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4714172170588027,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7877,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.41543249343540894,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.682,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.41678514930683885,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6574,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.5473185055431781,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.841,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.5081509533859854,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.7358,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.4717804951268603,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.8328,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.5279633817232753,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7876,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.5469059591551831,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.8143,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.48881655675917796,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7419,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.8183661784871328,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.9628,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.4443008674328188,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6465,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.491375950496024,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.7629,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.38746514211583577,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6489,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.5123092130360583,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.8181,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.5640094489556691,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.7797,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.4642024801709126,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.8212,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.6556017469175548,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7891,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.4015401241516933,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.6336,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4381193049202663,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.7072,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.4536050378109064,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.6754,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.368242729052764,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.5968,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.5120152908755962,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.7909,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.6537452367657269,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.7993,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.44996218961547835,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.7202,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.45925587252682554,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7205,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.4784011669048103,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.7557,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.38377975270286485,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6405,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.3738615814125053,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.6211,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.414407263661106,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6923,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.546862686175386,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.8074,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4721171141095232,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.722,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.5250980188045624,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.7206,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.39319681519493027,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6868,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.5298182028077267,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.6903,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.48201347038805087,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7593,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.5282216413510522,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.7375,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.38140745448157326,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6859,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.720905548280619,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.7977,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.5943362314470606,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7603,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.44026458461341533,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.6954,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.39168038653597353,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7052,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.5051585049106107,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.803,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.5093371160760573,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6966,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.44492165541172995,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.6336,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4586654762573073,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.7204,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.475042880175623,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.6966,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.48692832005893966,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.7027,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.6533590380966955,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.7991,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5088543228951543,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6792,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.5365077711844575,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.8042,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.607910990415707,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.709,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.543830616087409,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.8347,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.4479815487693433,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6691,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.5575118745140125,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.7793,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4623527812061081,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.691,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.47170960129034645,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.7664,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.40589211723619584,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.695,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.5595609747168507,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.7339,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.331177129924808,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6259,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.42842244367933835,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.6795,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.5303146911759715,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7196,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.6413552273769738,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.8235,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.4896679986097937,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6268,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.49353178307848933,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.7713,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.42714742055982385,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7156,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.47129628936070617,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.7049,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.5655082901936167,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7203,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.4797701282259538,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.772,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.483579335699633,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.7338,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.4856004197153419,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.7339,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.4844727919392016,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6873,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.4354352674543203,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.6916,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4382216003346178,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6948,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.6770346482301582,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.7415,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.5115812748312439,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.7453,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.40621159507540683,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.6829,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.41199024634843723,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6893,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.49634731217081535,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.67,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.4959416286449828,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.732,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.5879885666504012,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.7008,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.5017874717397822,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.7023,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.4889156027327041,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.6859,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.41541902496673644,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7111,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.47242855088569513,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.7048,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.5848713015965614,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.8022,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.44615163121800333,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.7684,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.54850319105347,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.7982,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.5625949181314011,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.7601,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5050525040384994,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.7682,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.7905231526074173,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.7779,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.4542040021729755,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6861,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.4644381199480783,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.7356,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.5818575190161742,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7372,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.5567751820347714,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.7911,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.60641735952812,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.8726,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.5685906303235803,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.7652,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.4818487663254473,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.719,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.5263438363503978,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.7884,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.5688085645171608,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.7585,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.6889185312449415,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.8714,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.4431544075081975,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7293,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.4892432936929515,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.7583,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.5363432561645519,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7062,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.48599545437169195,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.7263,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4877492160507757,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.8108,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.38602828530404476,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.6334,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.44578160607841233,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6839,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.4313393619764014,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.6248,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5766723504134088,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.848,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.41382160420692804,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.6548,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.5187264244623699,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7366,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.63150061116092,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.7477,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.43520756347287115,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6083,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.6954729281581934,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.87,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.6523736879654485,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.8212,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.4351809267256565,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.7426,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.49734748006175955,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7176,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.42863184313110836,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.6746,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.39624365515091226,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6346,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.6070474678947206,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.8173,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.6152814779816568,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.7342,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.4735777668957982,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.7085,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.44779780181339546,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7101,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.3927049863563379,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.7166,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.44215943404038016,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.7443,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.5161580910333338,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.7128,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.4904210254954027,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.746,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.6310426985660955,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.73,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.484804684526844,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.7435,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.4031087847091995,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.6395,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.48903277198413786,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.639,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.7331550646221433,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.814,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4628809485755903,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6781,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.3896616608964118,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.6409,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4596513087094253,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6971,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.48381308759059494,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.6992,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4567329216654885,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7324,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.5193372823300771,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.6888,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.6117060657697789,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.8634,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.44100918029650976,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.6763,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.37502419948636284,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6432,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.5334434133365746,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.6804,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.47365930471855267,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.7115,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.6031283485154669,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.6359,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4574835901358478,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.7119,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.5026677386432215,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.6995,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.543562020854025,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.7052,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.5899728969075093,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.731,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.37569145101022755,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6681,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.5412787308770931,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.7451,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4345081969623715,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.722,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.3891793799421602,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.641,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.6988491234269987,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.8525,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.4721345087077115,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.7629,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.5047474078895655,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7768,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.6228980544298343,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.8038,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.5415475953360327,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.7996,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.47207787846432325,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.7772,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.5748806476455358,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.811,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.4220631995351488,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.6474,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.446191054611265,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6904,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.4354242087444171,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.6407,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.5315934381080039,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.7782,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.5308006888632717,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.7973,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4714340989308455,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6507,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.4665239385162486,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.672,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4634796827042525,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6559,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.4419499418289889,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.6888,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.49012435287452877,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.8121,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.41619064786271287,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.7139,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.4668223284459053,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7529,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.5075018339710508,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.7548,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.5599300622049045,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.7873,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.47822143605307643,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.7526,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.383553168711321,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6544,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.5037697991826849,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.7349,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.5003655799335224,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.7502,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.5214482615730851,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.7391,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.5721289899083863,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7481,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.5026848248839152,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.7492,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5282502110567238,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6344,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.5862578699477898,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.8543,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.41667306780092783,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6662,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.40082055089727653,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.6595,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.550060804819665,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.847,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.5639909001781118,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.7041,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.5914599041965697,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.739,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.5320383614652344,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.7595,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5121040065383884,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.649,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.5353742601330876,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.8626,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.5437078836728698,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.8018,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.4711045843340895,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.6585,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.41446496435810776,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.8136,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.47359698246292387,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.6875,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.481888636253016,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7655,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.48866025654384593,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.7292,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5234734449607475,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7774,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.45442866945514243,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.744,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.5707376554409354,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7974,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.47615273754341614,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.722,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.5980890476489649,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7247,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.5078096465834021,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.8045,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4942281025658662,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.8084,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.5603283271593273,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.7097,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.524348482958465,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.736,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.5103150111409667,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.7533,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.47126320658347026,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6255,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.4374595099907046,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6519,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4303500396061869,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6718,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.5027048542692559,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.7632,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.40345310504503545,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6327,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.5235635729752205,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.7656,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.46775735024885157,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6709,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.44824632687201565,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.6463,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.43119055221852337,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7009,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.49913531126759625,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.6509,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.5102335232179838,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7075,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.4969901617512161,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.8466,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.3780136813925014,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6087,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.5465231620290661,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.7149,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.49426935806759537,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.7381,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.4569170953485425,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.7881,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.5041880961100881,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.7477,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.40728285040341056,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.7686,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.5505105644553496,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.8215,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.5172856369067419,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.7142,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.499399811683604,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.7746,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.5647289041005384,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.8716,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.558860908349905,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7367,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.4962091966102615,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.718,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.5761548093582705,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.7649,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.489217874580784,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.6173,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.49397258325557336,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.7394,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.4770680027485727,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.7152,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.408646861827668,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6599,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.4389837381985532,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.7895,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.44752592832878274,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6156,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.5836464970149965,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.8533,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.45473012560542037,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.7159,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.5057251789768997,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.6794,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4317512882835771,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6841,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.5449199765588195,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.8055,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.49405887408280086,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.7087,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.7519298947851403,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 1.0253,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.47327704708410107,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6788,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.5334523235482905,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.6963,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.527512599760187,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.8435,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.7552247746233401,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.8224,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4870892290302616,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.76,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.504087526922365,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.6952,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.4738775389398773,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6668,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.45543660736999125,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.7908,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4975029393577986,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.7367,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.5299062482370205,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.7523,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4619442596125995,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.763,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.4146244321982028,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.6935,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.5298289674346027,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.7382,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.3724034691465779,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.6287,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.5051314347173688,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.7146,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.44418282139927207,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.6672,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.5281467347436645,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6843,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.5394669528581812,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.8433,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.673443803006134,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7906,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.473550295777955,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.6912,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.49779496772024334,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.7571,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.4834002939644358,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.7499,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.5449031268397762,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.777,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.46602851849871574,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.7342,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.5296666070983255,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.7446,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.47795206546734526,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.6973,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.48428904753018837,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6719,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.5101617903481329,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.7046,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.5035560136801784,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.7437,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.4949316018847181,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.8164,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4514666113382206,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7741,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.4555143143258106,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.7282,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.5364032120210537,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.781,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.4292570766269159,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.6679,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.49169442964223203,
+      "learning_rate": 0.0,
+      "loss": 0.7719,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1011338945232896.0,
+      "train_loss": 0.8072394575595856,
+      "train_runtime": 18634.52,
+      "train_samples_per_second": 1.073,
+      "train_steps_per_second": 0.067
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1011338945232896.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..510a6be2b102e7665c940a276211fb291b0a8b18
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7f54ce1a27d1f0d933d828f6ccd9ba59740825b4
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b28e891b8dc636a042936948931b6fd50b267e56a0860ba391f734347487bd74
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c8ab5f20ba85db1903a8ade3f488519eae8f383a
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd8b411e68e555a9e54f30cd99932b89ebc7ea20d16427f6b11e184de776b61d
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..503ead140761ab7a0e61d00bee04ac438dcc29c4
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.9080813842870766,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.3037,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.8041941309224327,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.1017,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 1.2219852888632978,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.4947,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.0625205619778837,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.405,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.9969124865984521,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.3029,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.7206323835164766,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3023,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.7470650917108264,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.2665,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.948707380340728,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.3803,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.8089592105967153,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.1756,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8676067765283894,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.0588,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 1.263092953497496,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.248,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.9473292732665697,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.1317,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.8477610058044885,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 1.0623,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.7667070066497372,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.0126,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.9625049180961249,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 1.1966,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.8126893313366784,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9224,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.5998893295288562,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.8809,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5525965962771149,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9377,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.6807763485523121,
+      "learning_rate": 0.0001,
+      "loss": 1.0672,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5174866286955669,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9056,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.6592880038492572,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 1.0465,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6491568093567391,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.0704,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5331666347350892,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.8278,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5730405073227411,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8971,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.5443068513791588,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.8812,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5864765390444897,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9682,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.6398113996329627,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.929,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.6544333191146848,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 1.0644,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.56998721759652,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.8449,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.576332085668357,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8513,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.6675100680815883,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.94,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5728044946917537,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8683,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.8221054156386518,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 1.0743,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.4980006872192115,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.9559,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.6054537884300073,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 1.0058,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5007100149127811,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8745,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.5122902755716746,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8939,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.6651795910026952,
+      "learning_rate": 0.0002,
+      "loss": 0.9332,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.7491880786339893,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 1.0938,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5636400991503409,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8543,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.6360793584421368,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.9355,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5589998853524988,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9103,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.5226562362394936,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.7912,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.540808102334626,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8746,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.6609373211062586,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.9303,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.6490434451115461,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.9519,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.6205685785171462,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.985,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.6233932799073175,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.9277,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.5532631479296956,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.9422,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5500345487184016,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8876,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.5753158846253149,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.9135,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5410274988154797,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9717,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.4639919243788228,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8245,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.6341811700951703,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8575,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.6186586598120916,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 1.0047,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6975096235960725,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 1.0039,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.6137359431792841,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.9686,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.56801863237565,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9022,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.6280518664851908,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.9231,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.47910875481119425,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8217,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.5688670302130326,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.9648,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.47222584023221503,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8544,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.48411043524638503,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.8043,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5523628206241382,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7661,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.606102474993822,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.88,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.47264111938475795,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.7791,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.6030545368737588,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.7877,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5380818452529288,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8597,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.5668178890984674,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.9309,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.7492229436319809,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8776,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.45594386398155734,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.8762,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.6689489280264908,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.7738,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.5824962388090401,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.9494,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.6651407546192796,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.9906,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5912319175461134,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.9225,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5877386519859602,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8798,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.586893525784064,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.8399,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4990675239565995,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8533,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.5485876587251447,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.8259,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5101625540764176,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8224,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.5053166186801189,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.8231,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.6467634237745158,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.9116,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.5479339960363587,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.9351,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.5165396779025556,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8045,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.4662633534647873,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.8121,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.548071998854012,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8916,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.4529264571570421,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.8347,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6098404262902024,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.9271,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.5601937657742456,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.8876,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.522349236836615,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8808,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.5918162051866122,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.9784,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.48353602339655577,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8732,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.563166983744731,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.9462,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.48223871559795334,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8935,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.5142521212646851,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.8258,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.6175859185606641,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9889,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.5818806624503104,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.8124,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.559895831107885,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.868,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.6709827607983061,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 1.036,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.45243485733229544,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.7627,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.683941710270608,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.9704,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.6248216468352323,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 1.1019,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.5535038671388882,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.8644,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5286991560003955,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8586,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.7117394642571703,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.9799,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5948683491083182,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8904,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.5310452065515473,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.8672,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.6362836731096708,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8982,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.6631034346390334,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.9353,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.596396075120275,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7807,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.6183944289155431,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.9543,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.559201812127621,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.9205,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.5083531814012113,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.7971,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5671801338505673,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.924,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.6865308042995342,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.8445,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5813178583612161,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.9226,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.5518225766620017,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.8605,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.5459956040550069,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8972,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.5806101685858397,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.9246,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.6021871429951688,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.9122,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.5962927746824843,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.9049,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4523411532532136,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8217,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.5865078340068325,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.9294,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.45349956904909167,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7856,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.517464263535899,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.9466,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.6575914482573735,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 1.0001,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.5446715985980961,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.943,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.7043591135383009,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.9803,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.6092843978143063,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.8894,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.6176019096628385,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.9069,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.6835264015786288,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 1.055,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.6602792610715091,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.9217,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.7051153166006852,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.9149,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5291715643966506,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8375,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.8489485432390967,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.9701,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.7030018259258961,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 1.0476,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.41150624043839457,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.7894,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.6158901773012407,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.9524,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.5306678189960342,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.8594,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5973439965714,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8997,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.562538296474552,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.8929,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.5964221253209941,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.9702,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.6133126092378702,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.9131,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5354137818878005,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8523,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.4640625679295804,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.7739,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.5512099210314915,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.9539,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.5112422245899192,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.809,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5137823206631175,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8841,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.5535363768420065,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.9786,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.716718046449404,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.9377,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.5273600234368598,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.8908,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5659469100313295,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.9273,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.5148339514433602,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.8526,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.5668077761671138,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8812,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.5116988392192973,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.8693,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5905348006861363,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.9094,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.5206388139216594,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.8352,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.6214901397689111,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.9938,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.737160634050996,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.9611,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.520320642155761,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.824,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.6084476825668118,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.8687,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.47688399456411,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8424,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.48499020012350524,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.78,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.540156343662923,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.9063,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.5516619124746854,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 1.0197,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.6793270520647662,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.9356,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.6449131975857184,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.9387,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5909935070229813,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 1.0363,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.48583088614787706,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.8313,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.628898629621968,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8713,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.6085125380724073,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.8776,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5975998757056563,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.819,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.6390380149254054,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.8032,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.46959578227076104,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.7833,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4762964480716448,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.837,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5105373450821388,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8487,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.5468553130094058,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.8661,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.6907998813306322,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 1.02,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.6055411372103042,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.8596,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.49086621144765663,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8739,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.5263248779575109,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.8587,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.5200412219564152,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.9819,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.4774845980916723,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.8869,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5959662925959347,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.9383,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.9842793757570049,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.8066,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.522478559416786,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8053,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.6353334165247837,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.9887,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4735932083368219,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7937,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.675790623641156,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.9149,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5456571446884041,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8176,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.6396353944202722,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.8454,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.6232851227873076,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.9224,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.5150570016172883,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.7933,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.528689330359432,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.812,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.4711657231777871,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.8769,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5031768770285683,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7461,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.5753357539717228,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.8839,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.47826080000077054,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7602,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.5331109078884061,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.9127,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.513453303514768,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.9073,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.5306547540075643,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.909,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.5097127261566166,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.9403,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.5219176059890506,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.7665,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.47743145543637183,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8187,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.8883065430241422,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 1.0057,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.7178760236368346,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 1.003,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.5944311916972052,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.8245,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5477617007454727,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7779,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.4945521838206525,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.8471,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.49935690116121134,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8117,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.5422040702495826,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.8864,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.5525455231214514,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8636,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.4661463876181464,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.8049,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.5076418761116616,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7897,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.4782565516179986,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.8408,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4858303386449708,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8167,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.6632669237035178,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.9443,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.5484973819890846,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8538,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.4925168999699814,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.8905,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5876668431809312,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8862,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.5314406517026361,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.7883,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.6817510756689252,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.9403,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.6989926738514176,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.8941,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5252580793669265,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8209,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5850348971319385,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.8525,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5166293535713776,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7429,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.70187109719964,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.891,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5867245972162806,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8451,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.5464230292978006,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.8528,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.6041812836460903,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.8383,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.4595506522936158,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.789,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.6382950989160578,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8557,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.4611123108062563,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.7605,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.595657447518986,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7815,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.4633808160151315,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.8074,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.6901121421546078,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8163,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.5616861858844762,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.9254,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4155470020456329,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7098,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.5129706886346935,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.9079,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.518005849891447,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8289,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.4078707555014803,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.7314,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.6197715152766672,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8875,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.5114345849769838,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.8891,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.48995688769097845,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8496,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.44177139312483277,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.7739,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5348604710766836,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8421,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.6622529640570501,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.8967,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.42488582416266113,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7882,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.5272002452985016,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.8122,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.6144604512161106,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8928,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.5690544597134083,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.8953,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5218802931219912,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8486,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.4688014307919003,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.8059,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5704523846582193,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8298,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.5382313464182708,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.8604,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.7253872353623282,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 1.0071,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.5985727808398382,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.9913,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.5356362639095812,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8476,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.48684948291038294,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.8489,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.44091418628157353,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8342,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.45212614639546883,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.7955,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5450685948821071,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8632,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.4666481729146395,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.7784,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5425279104506674,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8956,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.6870473826461702,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 1.0676,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.5810030337078149,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7934,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.6519705109175864,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 1.0016,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.6754890438239762,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.8099,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.721804852693335,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.942,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5476317566123734,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.9227,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.5793651042918275,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.8189,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.5445884292655935,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7936,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.523425477223313,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.8962,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.7693345684662526,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.988,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.6309561995456173,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.8887,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4918388328990002,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8453,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.5032161536321187,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.8303,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.6350141023434128,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.9384,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.5046679114657476,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.8422,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.6290049439223223,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.9196,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.5724433204626115,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.8146,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.5571221248924423,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8749,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.6030624668703969,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.9406,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.5112945820734779,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.8608,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.4452627787024938,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.7551,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.5760706488202555,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8301,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.4719479158394112,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.8355,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5311096217325775,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8228,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.6070868831248841,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.8855,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5910494736537589,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.8853,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.49561716542204515,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.8828,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.5539093378586168,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.9094,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.5477538088346802,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.8865,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.4932579666169797,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7896,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.5939639494694906,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.8902,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.43342525098781615,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7462,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.5679787973019467,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.9087,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.7676538546927215,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.9968,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.5628902043632186,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.8499,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.6251348859155498,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8519,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.5419026827932641,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.8473,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4681480734347631,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7225,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.7028670736176164,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.879,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5177270541055513,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7814,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.4979783679728938,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.8974,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.46330771611366944,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7494,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.5875281058129088,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.8414,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5336432400910488,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.8316,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.6295129876568483,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.9598,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.5151871712060029,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.826,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.637025894741612,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.8772,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.44883188465552115,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7352,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.5037347267609362,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.7849,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.46999643690579695,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8446,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.5486126383321374,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.793,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4690195039420612,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7481,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.4469499837433491,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.8287,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4704646478693022,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7376,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.5057383073850039,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.8314,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.6310987292783401,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.8951,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.5449317163004509,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.9102,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.514102594990965,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.8377,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.5559467329804153,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.812,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.48821193891200365,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8487,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.551004542218846,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.8464,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5397512306039455,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.83,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.48540731321502306,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.809,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.861999686549182,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.8021,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.5336204933791293,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.8654,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.6022331354048954,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.8526,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.4690650461656505,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.8373,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.6677068500987228,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.9869,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.5612166144020337,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.8014,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.5268558472286875,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8929,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.5530749106640147,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.8626,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5059780985423626,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8821,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.47676617124447485,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.8224,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.4620426249713613,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8328,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.5028953666722216,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.8562,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.47652443163944547,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7072,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.6066152782490621,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.9276,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.5528112731017201,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.8803,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.4918047438378406,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.8163,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4535225066788238,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7531,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.5375438968538784,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.8982,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.5308274941140089,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8181,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.5609607519764662,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.9398,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5618809545792961,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.9222,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.49267474774738795,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.7758,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5455689194244441,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8552,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.4429486571180159,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.7324,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5115503357592327,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7175,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.542034705746471,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.8652,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.5700986682212333,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.8818,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.6164349413931524,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.8666,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.7599405188320725,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 1.055,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.5167876671609728,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.8582,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.5068026331777621,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7654,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.44707436209700097,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.7804,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4080835450589432,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7625,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.43926437412321234,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.7466,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.4050385558699055,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7511,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.5654226910960457,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.8699,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5628593254082522,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8403,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.4425185566602141,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.7543,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.5181238955708138,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7846,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.6049124328529043,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.9702,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4578067005222997,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7539,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.49441672286942956,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.8168,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4076323538241769,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7347,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.5343987743723532,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.8339,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3896772629440146,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7062,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.5018808715344389,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.7814,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.6303121820253016,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8944,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.4571025291047099,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.749,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.6095006648443052,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.9778,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.41866928759649463,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.6948,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5119080641088787,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.8283,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.4479758812016671,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.7014,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5338053773935066,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8415,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.5426629713846673,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.9578,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4765158057171693,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.8188,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.606064025013686,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7272,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.46583597442321,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.747,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.49170481724700105,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7232,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.6127045362104712,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8813,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.5205613144936889,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.8447,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5048757231973682,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7569,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.5623118605368121,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.855,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.43956424774060915,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7265,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.4214455019569896,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.7722,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4784273054772774,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7692,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.5834222022611343,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.8919,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.5106219005997958,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8209,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.569192961747009,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.907,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5621626398180208,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.8135,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.48478603551166005,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.737,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.5159755890673956,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.8217,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.4904250982213549,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.8346,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5040140792147169,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.9222,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.45725979760283203,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.7994,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.46310184538021054,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7577,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.4992877615928166,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.8552,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4694773773028324,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.8345,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.5713594558988307,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.9252,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.4806296902614307,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7922,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.48841973051979015,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.7932,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4759306612833967,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8519,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.7113908699203894,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.9058,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5201090363863141,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.8838,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.5090961313247779,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.8772,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.6185209914381692,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.9263,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.5579821090590285,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.8662,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4182061811503812,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7299,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.5313234861260732,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.9152,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.5699139186225048,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.6973,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.539483811029819,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.899,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5149907818436924,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7605,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.6473197823751683,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.8751,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5248937236530749,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8775,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.5163199723121221,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.8303,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5353908172771882,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.813,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.4946626196922246,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.8639,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.5917448093868592,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.9257,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.6444406512531591,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.9207,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.45635652337347343,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.781,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.4949993574665331,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.8141,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.49600367041083593,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7994,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.7260207881282403,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.9608,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4720694087199054,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.8102,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.4847251100010436,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.8206,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.6339699151906085,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7534,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.5331544476642404,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.8609,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.381371833096432,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.6884,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.45499991014417324,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.7211,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4305793653382324,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7718,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.4969883493881603,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.7574,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.5599147758204644,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8541,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.7321788107994219,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.8167,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5037162560823695,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7933,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.3798838749793671,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.6813,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.6728978647696912,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.9999,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.48579585049914625,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.8079,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.6008468765117895,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.8463,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.669263341740366,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 1.0313,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.521168152258116,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.8429,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.4890716157709086,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.8428,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5917499099002614,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7181,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.3608371868458814,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.6941,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.6327154948077564,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8313,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.5004531913410212,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.747,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.5275165881107703,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7813,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.5222678604579563,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.8047,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.5502306110907442,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7823,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.4002829564402694,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.6921,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.5955630543614778,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.9125,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.4775347402897507,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.8071,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.5417093409333327,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8894,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.4558793192299627,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.7882,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.5501540489717972,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.8402,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.5412326751618114,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.8552,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.48300319301871625,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.8592,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.439939727513309,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7545,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5902929394067173,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.8344,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.4993027137143001,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.787,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.6600975386600476,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.8586,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.5292932095632575,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.7707,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5408992613553245,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7343,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.4303682374368871,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.7495,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.44958426243468036,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7517,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.4935174463060379,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.7619,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5732863643126523,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.9159,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.5341629172113251,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.8164,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.4983179961540753,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7818,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5476769205838636,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.8014,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.519853059327151,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.78,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.5122673066399209,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.9465,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.4884806177478218,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7465,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.5507582946850637,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.8738,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.42981490061468713,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7587,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.4197307756866334,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.7554,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.505519660624182,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.6974,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.39481543226001375,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.7646,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5922524035330967,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.8994,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.42693152478544344,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.6892,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.8510976031521688,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.9395,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.5474386064002674,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.8637,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4789876152303442,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7921,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.5376569606274728,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.839,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4426845745081761,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.6722,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.6345291879942117,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.9571,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.5118545731832741,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7311,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.4960660282057398,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.8514,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.539674335810636,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.824,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.4800850704177175,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.8428,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4351799728360236,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7292,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.43285418219243915,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.8136,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.5216824155883797,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7571,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.5413483672953808,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.8638,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6398444932660209,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.9068,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.7444916635272512,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.8793,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.6656926945246038,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.9014,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.4457908869026868,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.7344,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5697359549783958,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.8217,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.5056678280708421,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.8574,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.5484756774270771,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8512,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.46505218004814153,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.7835,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.7119745829060073,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.96,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.5449520855135285,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.8196,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.47892233269452206,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7682,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.4848983418672802,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.7724,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.5019430321638111,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.83,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.4731872095334749,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7831,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.5077071364078982,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.8324,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.5813701285846412,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.8837,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4799917369825802,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.8781,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.5159308332117593,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.8789,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.5270262593076608,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.8081,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.6180301887875884,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.9297,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.6708371860981875,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.8549,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.5707816679612164,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.8378,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4923902211709554,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.8792,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.5531441500947429,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.9107,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4419632815828972,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7463,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.6061866078260476,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.911,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.6963024819971977,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 1.0253,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.49940481513481,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.845,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.5293551303353214,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.8954,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.5063413990157742,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.8538,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5562949404718519,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7987,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.48131314061880803,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.8152,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.5870675309038963,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 1.0352,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.45971466569011993,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.7168,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.6035126308260672,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.8577,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.45144590547708024,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.7752,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.39970917530382416,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7236,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.623590914707238,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.7703,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.7515130857546064,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.8271,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.4347546521826034,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.7611,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4761557601343759,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7657,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.44633336885236097,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.7421,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.5563343452610224,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.9967,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.6471019126729325,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.8541,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.47340527937997834,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.8474,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.46157400551191163,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.7955,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.45508509589196,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7653,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.5077739722759781,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.8425,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.8141208292900931,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.948,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.4076706826329385,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.6998,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5118664937647278,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7696,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.4816229683616941,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.7504,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.48930481676711723,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7651,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.45561086673747714,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.7319,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.4416422478225898,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7528,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.47418182478577525,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.7091,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.6401681485409784,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.9049,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.6982634373095091,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.9297,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.507055788697773,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7311,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.5807270252926734,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.7908,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4431858492829446,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.8259,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.4849528454606528,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.7853,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.49166739471207144,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7069,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.5793994112069623,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.8024,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.5179173280187043,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.8411,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.6588825640045888,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.8524,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.4623706090981975,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7722,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.47102123092467946,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.8397,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4946138787683921,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.8148,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.5486114882605688,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.7791,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4755744045856606,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7625,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.497584922002352,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.8277,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.6326791729007039,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.8543,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.45632696349487545,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.801,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4239880741040958,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7306,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.5627946101804462,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.7922,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5875614063404327,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.9845,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.4552509273066403,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.7373,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4164798887182463,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7783,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.5657442455430117,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.7653,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4009350967174863,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7872,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.5234546794812653,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.8488,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4983773846295474,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7584,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.5481641154012888,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.7982,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4353146816985957,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7253,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.451915751410752,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.74,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.643640556182646,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.8034,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.4514025547080541,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.7941,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.5906026912192136,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.8287,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.5653389985830627,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.8367,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.5052738515343731,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7988,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.5161892275310288,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.7462,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.42183424458748847,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7683,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.4048096584332975,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.7521,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.42705254193421605,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7286,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.436739457475295,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.7054,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.7333250606069377,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.911,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.5989584703058212,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.8636,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4987136821184533,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7914,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.5538093469915446,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.8548,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.6225202094050077,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.8493,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.48334131016487303,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.7604,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.6200355119258774,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.9533,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.4299026309261422,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.7193,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.38030842079196864,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6811,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.5467270629092524,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.8047,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.5068541306344564,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.8656,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.6618083084463389,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.8956,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.48105475118151797,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7298,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.5281532410356332,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.7413,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.47052106414012906,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7982,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.5108428806336127,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.8337,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.49989487284593914,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.8609,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.47018363591967,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.7778,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4238498581149461,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7382,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.690646476671765,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.9422,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.5640038941521277,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.9124,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.5565706526595023,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.7769,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5550763692749047,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.9181,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.45104601971076125,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.7894,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4876402253325286,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7976,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.5019640463439436,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.7927,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.43433623562478296,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7525,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.4234622503940986,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.731,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.48766634284633525,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.8656,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.4669931847818778,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.7783,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.5642451353933601,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.9083,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.583895853150987,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.908,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.367874215358589,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.6426,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.48006836951143605,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.8314,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.6162029310047483,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.8896,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.5537137865423145,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.7685,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.6992075775089436,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.9342,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.48711313691159946,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.6825,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.5367535076245012,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7888,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.5051390864160392,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.7454,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4393702088050236,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7754,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.4357646948380654,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.8445,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.5655160401352094,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.8535,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.7101369418409721,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.8571,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.40500970171845685,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7254,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.4581178610983195,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.7067,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.5095812716333972,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7643,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.6242672143526947,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.8023,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.5095925423914407,
+      "learning_rate": 0.0001,
+      "loss": 0.8115,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.5205824194286691,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.8013,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.527927691195445,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6588,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.5480290855603533,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.7971,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4645180080695204,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7798,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.5170689062427818,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.8951,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4998292336055176,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.743,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.5147063173006544,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.7651,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.5726403336065479,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.7562,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.4208878882710979,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.6844,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.5276694996108304,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7678,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.6036959186877415,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.8126,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.48157575829024774,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.8403,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.43205050772310427,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.6966,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4741793002877333,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.736,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.7250128268388728,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.9711,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.5192983059700997,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7779,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.4821371547428007,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.7436,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.5253102273548034,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.9746,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.4640170393820187,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.7189,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5415548697443237,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.9317,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.5463924966747006,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.9206,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.5597937045959331,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.9472,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.47190876781340824,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.7587,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.49711210331072075,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.878,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.5562096756407807,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.7232,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.38207380026345156,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6733,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.4911593464807909,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.848,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.49585021140361113,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7304,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.4361997781441195,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.8047,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.48749707290843575,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7379,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.5585774710293935,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.8264,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.6017964959135491,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7454,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.4903811752775937,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.7924,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.5826468987457163,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.8853,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.5576332717239038,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.8411,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4666424712498503,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7513,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.392175235102589,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.7406,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.5988598795183644,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.8706,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.4549947867456565,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.6844,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4391528918297825,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7775,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.4968950226404202,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.7743,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.6191210643891846,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.9487,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.42276547342130816,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.7004,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.38049403358329675,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7383,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.6769621657629243,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.7771,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.8127544843604503,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.727,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.44351221474892843,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.6973,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4688048747951437,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7191,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.407535323330731,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.7516,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.6488950511252963,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.9266,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.5090581336710193,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.8334,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.5102832703365258,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7681,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.4691147327015194,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.8435,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.4425746375769298,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6959,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.47816355139058525,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.8025,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.45335055516153394,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7181,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.5544787082237572,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.8752,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.5453965897883242,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7189,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.5688413162790307,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.9164,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.5261329886502011,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7751,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.5893462892684176,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.9216,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.6318813315358957,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.8991,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.5114713319230283,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.7628,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4994537707955278,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7752,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.5040668896437293,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.8209,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.42331110495934854,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7614,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.49310945050722876,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.8032,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4438640468537216,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7209,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.5539746222516068,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.8538,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.46660431947531117,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.8591,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.38022037294727395,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.6972,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5449499620640735,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 1.0068,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.4169898296910137,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.7496,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.5203062122624942,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7352,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.5865678547802423,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.8105,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.44787543970841115,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6951,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.47026442836960747,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.7889,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.7319729212196735,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.8015,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.49252875503080346,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.8292,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.5425899442850941,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.8161,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.4573563520297278,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.6923,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.6037199037531603,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.8714,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.6271526509531546,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.8068,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5301535703360196,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.8232,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.5787282734319904,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.7647,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.6614524640109779,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.9463,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.4926108007256998,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.7626,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4804850373978476,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7277,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.6747587556763391,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.8613,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.4768374021425166,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7535,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.42548895965153943,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.7386,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.527810233538241,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7931,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.3945006668802455,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.7499,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.5018677373299719,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7797,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.42433532115712186,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.7494,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.534041498650102,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7859,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.512688615851546,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.8271,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.511471743976831,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.8376,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.4894391809863964,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.8335,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.5514529144557104,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.8408,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.7223618761573463,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.8607,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.5178506188237251,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.8074,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.42340531056837144,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.7043,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4964289596754657,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.728,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.47985426517246027,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.7996,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4588102653566252,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7243,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.42569781572618853,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.7516,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.436780018977493,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7312,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.4628449287698282,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.7581,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.6621084612531626,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.895,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.5027121856510675,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.7909,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.48726573400549317,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6746,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.437989626752331,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.7123,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.5906032482168247,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7483,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.5096323144956553,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.7342,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.404384369704306,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7219,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.47916413300610006,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.8172,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.5062713721851729,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.8235,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.4565530114216757,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.8377,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5444683418349793,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.8577,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.4475498314739535,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.668,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.6425774157029098,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.9029,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.5034987160910696,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.7365,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.6917235904630508,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7839,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.5605017746427178,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.713,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4738673000646273,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7566,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.46540916421865014,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.6973,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.5410959645351576,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.8646,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.5960088786759402,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.8148,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.5077873608234905,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7357,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.6215644323546918,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.8109,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.6707416014463301,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.8331,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.5176939198395552,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.7384,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.4708689942638851,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7224,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.4930737896490059,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.8116,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.5892752746539001,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.9113,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.5044815712064862,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.7958,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.43719563585620835,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7293,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.5366190725514896,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.8898,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5082613561131746,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7893,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.41749140031129167,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.7309,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.5170497650763862,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7839,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.3961332652107408,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.7366,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.49754985494595544,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7653,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.548924998318957,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.7745,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.478178654903062,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7816,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.49895033099227376,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.8334,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.44381124234760644,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.8153,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.4529039481898601,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.7455,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.5426609709679072,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7634,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.6334191463283818,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.8432,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.6944304196751075,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7671,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.5278882305384767,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.7939,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.36665114582951297,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6488,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.4360921501249804,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.6543,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.46621713569092255,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6824,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.5515014444454384,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.7921,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.41036853011171437,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6904,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.5171660362125989,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.8403,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.5072245001344229,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7688,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.37861571404640515,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.6413,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.5380753036323828,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7867,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.4303997532305666,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.726,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.6367177088524028,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.8739,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.4814612755099226,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.7338,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.5188486822929445,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7792,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.46625320330652303,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.697,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.45902185863682476,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6859,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.4638535553430814,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.7149,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.48348600053977475,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7693,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.4932746662890735,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.8263,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5570560204822728,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7424,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.4687109667729972,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.6849,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.5483076820634255,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.8234,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.5445418867646071,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.8508,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4614406941304066,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7635,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.5143174747810663,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.7847,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4994639294282618,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7482,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.48428453212647665,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.6998,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5057027434820639,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7062,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.49891374589042337,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.8326,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.5038228377305515,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7358,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.47454180105446425,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.7259,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.6162418744380285,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7957,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.3760689487003604,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.6411,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.5022764707957805,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6897,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.6651497049557669,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.9318,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.43376355417024504,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6982,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.6474226677331079,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.8946,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.48007278214523885,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7405,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.4374660499808529,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.7482,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.517081980205305,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7528,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.48323090918397094,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.7315,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.5021286628734111,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.8212,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.5141349387177144,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.7111,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5084201956002226,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.8196,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.4490843664890864,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.7306,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4695671353572929,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7991,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.5271477922335337,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.8185,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4195106319806952,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6767,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.42742675495065485,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.7576,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.5216083687371429,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7573,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.6288375895385837,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.8401,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4390594149564587,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7256,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.42755673057491517,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.7145,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.42681072223664257,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.7017,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.43743370476074006,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.7513,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.4737350149845334,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7188,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.6271243129210922,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.7849,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.5890111805244727,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7677,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.6277875462939261,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.9239,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5372185883245542,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.8173,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.46421961606055084,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.7272,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.5514663286606182,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7869,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.5732940160158861,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.8239,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.47141455825534484,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7133,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.42295740322072267,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.7267,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.5371004747077944,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.8628,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.45922615123491084,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.7993,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4484668906783044,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7552,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.4361054783759581,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6587,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.48593883605117566,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.7353,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.5085759460788811,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.786,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.550456404953214,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.8363,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.448597836416631,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.7263,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.5920039933884523,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.8562,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.6058660173901074,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.9037,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4854403817693627,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7899,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.4452987540370817,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.6692,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4070353886719388,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.659,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.46881195103092205,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.769,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4511038364938884,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6831,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.5106310960872966,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.8039,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.48236286700585296,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.7366,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.5531838732376217,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.7126,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4169564640631747,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7581,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.6070793415565825,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.7351,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.5040425072210082,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7782,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.3836134411498887,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.6477,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.49566390695142093,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7286,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.5542461919792804,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.7657,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.47384573394363444,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7194,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.40625182616849714,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.6802,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.550120899048214,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7858,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.9658197812369057,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.8226,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.5011003842913042,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6772,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.5464217346303958,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.7787,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.514591387424676,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7904,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.41602117726960824,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.6735,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.5684147563389843,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7678,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.5427250342476214,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.8478,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4462442901021536,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7516,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.47720348237429494,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.7417,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.5340837883423885,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.748,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.469677944178968,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.7223,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4166461423938719,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7077,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.5014958715161436,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.7866,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.4834069293364297,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6782,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.45015960082810896,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.7348,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.6624099916197879,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.9578,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.4489711710544979,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.7236,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.45833194341547684,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7572,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.4564155732637293,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.7495,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.5571210878690318,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7802,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.5350746748027275,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.8594,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.47110540715206695,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.7354,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.40285950217554056,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.6676,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.47110131286432433,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7773,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.6065167854178469,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.7721,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.47817847440544814,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7187,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.5278482236080575,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.8376,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4923776927714505,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6548,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.4171682018933475,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.6879,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.5248488852388212,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7632,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.4982641825250908,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.7282,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5664112883196568,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.8043,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.38128669321258,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.6371,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.43300831689783975,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7134,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.5295026247935329,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.6705,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5457004272916515,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.8528,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.48791989857955453,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.8148,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.5319220887225498,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7641,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.48648570106610073,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.7222,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.4097223911261753,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6958,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.45502173920914035,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.6441,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.487560468625458,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7631,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.6753084605502865,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.7528,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.634429477646962,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.8167,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.6032494837975706,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.924,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.5320878056424246,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.8351,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.5302958373609717,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.7477,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5154687813376203,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.767,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.4194848238512301,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.7517,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.5657590494786042,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7338,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.46394036128855876,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.6656,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.6585457520430493,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.7434,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.45631314099479964,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.6775,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.5670975321636078,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7364,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.4286433402915187,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.665,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.42347337598355206,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7033,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.6688146671083809,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 1.0517,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.43655168283507745,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6696,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.4264804446317368,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.7025,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.47865370372653143,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7164,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.43743867825112825,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.6818,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.5586615885344863,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7692,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.41130995308389967,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.6293,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.615142734187489,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.7167,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.47711098917288874,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.7416,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4113701672329664,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6607,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.5226142070356697,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.7642,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4994550804967028,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.7138,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.4357954776441535,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.7087,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.5880842155071231,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.8285,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.4636436847203281,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.7134,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.48765809255480125,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.7824,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.6074494031177282,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.79,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.5426320419053459,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.8803,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.43179728700368436,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.6927,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4870535985635999,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7859,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.5152243606756598,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.7925,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.4942498742428769,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6532,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.49471572909917017,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.7355,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.515602610898312,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.8216,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.48759582476970237,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.7691,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5319259723239352,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6913,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.5226626351276598,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.7946,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4222056092368926,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6735,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.5001031163121298,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.6677,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.4506197692500532,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6784,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.5095624169639343,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.8214,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.5021497321557922,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.79,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.5481051893245936,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.6788,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.6346051133611106,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.8301,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.5582374326562646,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.78,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.48300829039305565,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.8242,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.9117293868139854,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.8505,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.39984004320142186,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.5946,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.4204587040766119,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.7047,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.42752357682290293,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6708,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.51859445846189,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.8237,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.6608745813205704,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.9515,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.46954238976179,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.7997,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.5187667828336188,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.748,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.5233972597576384,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.6623,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.46699149175851185,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6952,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.5146291467073221,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.7285,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.41233758034817963,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6995,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.45338313191059965,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.7164,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.6167369290491044,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.7874,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.4608400987280651,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.7454,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5012559421002032,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7885,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.4750785039809705,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.7909,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.45985630757470286,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6701,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.4270953884966443,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.731,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4461302187521474,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6372,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.5341156075049326,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.8135,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.473636341286951,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.7409,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.5258329610105286,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.7215,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5026266818314066,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6262,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.5681522310035081,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.7847,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.5040600121056366,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.727,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.42975482071747706,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.6897,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.39088474908116566,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.7154,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.63331093049371,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.7877,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.5440867686986005,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7196,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.43657119763585545,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.6968,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.43726062031161556,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6916,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.49697995524720795,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.7325,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.642165149449136,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.8114,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.44671659843499995,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.62,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4750374625031183,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.7681,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.5677882577982228,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.6878,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.46059699211572874,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.8066,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.4889780511296242,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.6568,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.538242116224818,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7828,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 1.7775886069499676,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.7322,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.522496704085688,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7564,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.5101087553943181,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.8344,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.4908868315801233,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.7016,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.5870671112428668,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.789,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.41822305107363533,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6779,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.5044443801201322,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.7608,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4144813264330897,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6744,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.4945295553903849,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.7556,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3674677670770876,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.661,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.4172182450550059,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.6737,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.5509059310722004,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.8365,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.7240049071896496,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.898,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.43993964975766325,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.7361,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.5558844401182051,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.8276,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4327273912279101,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7568,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.4133468213755012,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.7052,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.495577941601173,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.8321,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.4883198956140364,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.7206,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4977080357466427,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.7019,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.4608703927284041,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.697,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.5371185255152909,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7261,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.4258325901526125,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.7144,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4440874414866273,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6816,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.6526896713751453,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.7963,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.47426110043298514,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.7609,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.39054648135296366,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.6764,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.42222606460520595,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6633,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.6289629979808175,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.7406,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.4661898031533964,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.7252,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.4896951405228761,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.7309,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.5683741886870963,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.7927,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.4927990120306898,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.7427,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.46204624575369263,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6762,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.5224452763300297,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.7033,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4839244360898422,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.7519,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.48478311455515977,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.7926,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.5675322597242162,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.7476,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.5558835033517052,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.788,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5021518913226307,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.7328,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.7089194156298404,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.7407,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.44794808070935765,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7134,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.5163707520447636,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.8099,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.48425426097146074,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6852,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.5636073280069763,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.7657,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.6161920215418742,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.8373,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.6067370788621518,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.6843,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.49878892018775045,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7251,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.5538931343889086,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.7267,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.5634469383423061,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.682,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.6756498171458365,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.8469,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5069613327102218,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7163,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.5896151746440772,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.8184,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.4194288233770586,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6887,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.47883310643741056,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.6584,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.5616475742536037,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7524,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.43184922997628633,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.6764,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.5000781498918973,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.8025,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.685194137409394,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.6676,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5853873908548777,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.8585,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.42778937703206454,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.7184,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.47639955095676556,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.706,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.618402241111424,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.7574,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.44633938289465214,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6354,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.5875385436189163,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.8067,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.5605745440514615,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.8492,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.4730052711787034,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.7172,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.5580800303772229,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7554,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.4254970757930749,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.585,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.45978322037020936,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7118,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.5907117314407938,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.7398,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4412387518837164,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.704,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.4995045790717866,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.7067,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.5051232470835921,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.8006,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.3971957363970268,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.6495,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.49043589782127034,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6654,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.5428837478231188,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.6855,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.5282569591654496,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.7237,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.5818019327247147,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.7772,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.47712081603654866,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.7363,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.34622702827014823,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.6029,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.6092477978535284,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6862,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.5636533235759575,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.7414,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.7189132278455704,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.8008,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.4069774053525663,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.6373,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.41969446413223566,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6691,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.43341089210286354,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.7241,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4763150523225858,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7293,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.5243786982758026,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.7858,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.6383878187859945,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.8287,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.47023598685149814,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.784,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4655342507169999,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6534,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.5541439582539616,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.8094,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.5354696327059384,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.7385,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.6277838369350939,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.767,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.41775621115792394,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6789,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.4706322350516361,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.6697,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5686920934875289,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.8155,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.4840851706332836,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.7131,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.38741107870169045,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6598,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.5555666846750233,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.7491,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4618925596656953,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.7333,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.5138487900812424,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.762,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.6272940777747218,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.7763,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.5078429540376489,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.7963,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4946979306301698,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7682,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.5182946181907617,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.6991,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.5235203112797608,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.8014,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.4425756060303903,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.6503,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.6513597876716953,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.9007,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.8321938932815445,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.7324,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.46951710995420187,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.7557,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.46986659494656524,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.6842,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.46907519031176875,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6883,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.5157370874627774,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.7977,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4449648889231507,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.669,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.5722331027119578,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.7645,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4162921246583935,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6824,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.4815905410333443,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.7512,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.5050075775858367,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.8343,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.456676045657739,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.6766,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.453441285772829,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.728,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.5350497495986325,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.7291,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.49917331824024186,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.7508,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.48142429639175954,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.7127,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4142110499243709,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6757,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.5430439435262364,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.6672,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.44330305598763314,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.7225,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.5096423082478944,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.7237,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.6415859400900558,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7913,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.5364277875549572,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.8011,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4728954859163424,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6631,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.5739372532945478,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.8542,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.4742345238629844,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.808,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.4364079485028247,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.7793,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5331155865940347,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7316,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.5758016730532783,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.6958,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.5318202745282977,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.753,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.45960030309214356,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.6986,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4748676117771573,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6698,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.49488892162902537,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.7741,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.6756885390812936,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.9493,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.4292986738342194,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.6534,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.46699781607588653,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6717,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.4680392648836856,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.6896,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.48369802762826214,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7021,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.6039238486058927,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.7789,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5113843239607643,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7344,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.4748128856557313,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.7957,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.6193323473187806,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7601,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.5466716178846842,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.7283,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.6398194238066671,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.819,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.5169431248383446,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.7962,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.5401491941477569,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7109,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.7791898394432363,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.7562,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.47300366205848676,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6712,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.5138926803184707,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.7554,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.44476488021254323,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.691,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.4243693394099248,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6572,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.5404335045720218,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.7063,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.48037402819702374,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.6838,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.4427890539725505,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6735,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.46952013285789856,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.6838,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.46652083777150416,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.7435,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.5603154803427823,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.7164,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.43953229098015056,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7354,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.5030118359576425,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.7117,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.6079376020902644,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7948,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.6204696509532744,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.7316,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.35035978662590106,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6629,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.5372213133397009,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.7294,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.5643803187500273,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6886,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.4306253841050723,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.751,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 1.0330732280513122,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.671,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.4447517153562386,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.7121,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.6492945091666067,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.7758,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.5360553457492165,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.7489,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.4555753036082643,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6926,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.5126847788360619,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.7871,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5762488567433169,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7813,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.5245685500496474,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.7394,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.6669227035607159,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.8453,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.8007736012426442,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.859,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4503205112473484,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.7615,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.49124340433023866,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.7482,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.42003071858441315,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6472,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.4920302625092041,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.7542,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.37614228484179074,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.5923,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.8642822948189379,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.7638,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.46753756589344825,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.7803,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.5012484711267575,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.7366,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4652899199121445,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7007,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.4954135796719314,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.7854,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.49258990364938865,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6938,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.7968378755440436,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.9773,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.5009259157520817,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6898,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.6449277388507836,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.7816,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.5317943900934,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.8529,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.713439857473323,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.7477,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4758822624453974,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.8277,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.4926167423651641,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.7139,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.5341949380143953,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.7415,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.426429622925696,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.7606,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5305430558327908,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.7863,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.45737929758712137,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.7229,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4494166283772884,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6684,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.4768000600089465,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.772,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4988486343729917,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.7593,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.40423663767248535,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.669,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.508845487296791,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.8378,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.4633942933703335,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.6417,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.476644606850743,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.7603,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.5714997209934206,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.8296,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.6631071655175154,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.8803,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.48653574690084533,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.7311,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4000816322132727,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6838,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.45438122903368056,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.6974,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.508389340822278,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.7088,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.8140920471901532,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.7153,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.5118056867395696,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.7829,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.4814940872961647,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.7192,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.456682871619528,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7038,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.6349546519149382,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.7399,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.49693359188520736,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.7429,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.4505753949008942,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.7351,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.5128317699007771,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7282,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.5283093131821548,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.7268,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.523943892468989,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.7612,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.46158018682087965,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.6429,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5046438985591529,
+      "learning_rate": 0.0,
+      "loss": 0.7855,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1026379869290496.0,
+      "train_loss": 0.8078127761363983,
+      "train_runtime": 18699.9059,
+      "train_samples_per_second": 1.07,
+      "train_steps_per_second": 0.067
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1026379869290496.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..65a18ac0ddc625e9b82c8f5e53d9256de168fde4
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "k_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fa0c2839efffa06861cceb07cf9fb3120741d395
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a225dc5d95a6ae1c7bf5588ef1bf4ec64ac15f287f3a3160363872eab388aa1b
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9f3a8904ce65dcf46592de761d9190b14e4d6f3a
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:256a8d5f058d7d4982bc5b91fb56e78638bba9a1a33d405895391adf2f6adabd
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..227027f4723ce951a6382010afd8937e12b3aede
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.002475123335683,
+      "learning_rate": 5e-05,
+      "loss": 1.4698,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8049937709697602,
+      "learning_rate": 0.0001,
+      "loss": 1.1835,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.8183093836391889,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.2271,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7683921707563225,
+      "learning_rate": 0.0002,
+      "loss": 1.1947,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.0510309251197087,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.2203,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.8134488573383152,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.0905,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.6526412161282288,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 1.0465,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5204967415290006,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.9215,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.6511808300296046,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.988,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.813306426855869,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.9748,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.6112244073376245,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.9328,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.6296798323148467,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 1.0615,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5638071334528209,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.8301,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.7128440290106496,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.846,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.6380147511290136,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.9178,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6283060315082349,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.991,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.5178639087438065,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.9272,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.551827741755611,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.9921,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4462509568812092,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.8086,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.6584174654719149,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.9826,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5995121455746469,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.9427,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5839687699018469,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.9771,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5688782708463275,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8553,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.550008976055812,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.9807,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5891229610132139,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 1.07,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.557817217847609,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 1.0502,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4672201450304275,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.7614,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.583233571588004,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.8497,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5361346707919389,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.9454,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.7578504266330326,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.9485,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.7930891130981601,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 1.0536,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5180379940487552,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.9213,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.5701771012145226,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.8956,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.7284617561125063,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.993,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.4134043312534286,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.9732,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.42496793413863926,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.7763,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.6619793121099012,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.9008,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.6522244445478148,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 1.0672,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.5462383817752346,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.9031,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5676805053548298,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.8455,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5240811947165249,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.902,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4613906058985293,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.8615,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4646253048462273,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.8244,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.529737159176755,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.8921,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4771248270697291,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.8796,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.6050629894691126,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.9151,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.7838777188238114,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.8912,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.6902887239461467,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.9096,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.5683919970051982,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.9269,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5290672853989341,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.8106,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.6170953146475162,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.965,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4521329285137652,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.9227,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5521583271278084,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.9496,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4666459577181258,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.8592,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4292954670514068,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.7974,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5466551052523965,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.8403,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.5883302435010964,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.8502,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.557397470521136,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.9663,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.5412293057735887,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.9034,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.489899862903999,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.9012,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5260608475920923,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.8843,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.6241407824627501,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.8676,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5395216158918755,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.8135,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.49240885512518473,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.8643,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.5511692850840025,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.9358,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.5114676398612253,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.8654,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5612846646984437,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.9446,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5697157540640136,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.9043,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.6772087664591744,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.9843,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3986979971884232,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.7799,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.526004756548793,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.8681,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5035036852178381,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.8732,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.6202080509189101,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.9516,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5163561353247864,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.8012,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4611214523140949,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.7428,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4680448157741042,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.8145,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5157699384388195,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.8448,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.42118567269139934,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.7818,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4084175987467916,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.7473,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.7765002711996135,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 1.0617,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.5141505553150338,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.8848,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4665505106546281,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.7238,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.497247057356739,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.8234,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4914760684720225,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.8842,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5224002630076798,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.7794,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5024783519544875,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.889,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.6938810000753591,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.8999,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.47416623333153984,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.8048,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4149860597614193,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.8108,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.7010817596406334,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 1.0248,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.6575269786064363,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.9302,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5316026929850952,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.9774,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4776089538625679,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.8102,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5600853475954763,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.8381,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.41374368607395756,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.7665,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4837638603469293,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.8811,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.4972027931391034,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.8563,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4990914932563448,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.8019,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4319357944618215,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.7844,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5958916752528824,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 1.026,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.5311068727150814,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.8616,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.6460726823451469,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 1.0715,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5343341046453493,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.8666,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4809311266333434,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.8524,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.47321674516129725,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.8301,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5182733171026215,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.8791,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4645511558470242,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.891,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.47446677400627907,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.8293,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4163475564341525,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.7825,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.49642122885816725,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.8895,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.5637361463741167,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.7237,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5863037005074132,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.9806,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.46124574810331515,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.7719,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.5926944784931045,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.875,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.5545491147233557,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.8167,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.43671807283873015,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.8003,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.6098249666296813,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.8736,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4601261313002717,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.8029,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4248790991858984,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.8119,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5713345517214953,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.961,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.42342878395245415,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.834,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4709530957161904,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7841,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.6171273556109618,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 1.0197,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4367515608451683,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7927,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.6110106581034457,
+      "learning_rate": 0.0,
+      "loss": 0.9813,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 97576377352192.0,
+      "train_loss": 0.9055721774101257,
+      "train_runtime": 1848.2224,
+      "train_samples_per_second": 1.082,
+      "train_steps_per_second": 0.068
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 97576377352192.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b9aee7ed8abfbef9fe7c30417e210c0d1bcc6e33
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "q_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..27983adf166e44fc657e2e63a687702682d388e2
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46ebc18e44273048ffabcbd6989f0070ad2770ebe3b610b1f6233da3c0473f0e
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b19207afc66091ffecde940adcdab111c0a2b02e
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53050a5bad14abe47326aa75a3b469ebb320c2737545f12d905bb248e1a3a2e3
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a2990cea99da5b26aa69d7c9339df54f5d331e6
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,476 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7732680457900044,
+      "learning_rate": 0.0001,
+      "loss": 1.3267,
+      "step": 1
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7315180877831058,
+      "learning_rate": 0.0002,
+      "loss": 1.3075,
+      "step": 2
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5820563140825258,
+      "learning_rate": 0.0001998629534754574,
+      "loss": 1.2362,
+      "step": 3
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.0746012742648456,
+      "learning_rate": 0.00019945218953682734,
+      "loss": 1.1503,
+      "step": 4
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.8470915908655696,
+      "learning_rate": 0.00019876883405951377,
+      "loss": 1.0586,
+      "step": 5
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.44503583996845364,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 1.0279,
+      "step": 6
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.44536491624905683,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.8624,
+      "step": 7
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5470997705620867,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.9922,
+      "step": 8
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5609845265919665,
+      "learning_rate": 0.00019335804264972018,
+      "loss": 0.9773,
+      "step": 9
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4590082649410531,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.9101,
+      "step": 10
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5114746549361595,
+      "learning_rate": 0.0001891006524188368,
+      "loss": 0.979,
+      "step": 11
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4431229621480573,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.9336,
+      "step": 12
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4668034216043737,
+      "learning_rate": 0.00018386705679454242,
+      "loss": 1.0725,
+      "step": 13
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.39231345197066647,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.8184,
+      "step": 14
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.48905103020214946,
+      "learning_rate": 0.0001777145961456971,
+      "loss": 0.9652,
+      "step": 15
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.518970830511111,
+      "learning_rate": 0.00017431448254773944,
+      "loss": 1.0049,
+      "step": 16
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.47750721567014115,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.9495,
+      "step": 17
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.36048296396596413,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 0.8709,
+      "step": 18
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4572367034332387,
+      "learning_rate": 0.00016293203910498376,
+      "loss": 0.9819,
+      "step": 19
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3922447630076946,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.878,
+      "step": 20
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.34455434430176324,
+      "learning_rate": 0.00015446390350150273,
+      "loss": 0.8797,
+      "step": 21
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3481663865777293,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8555,
+      "step": 22
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3951295872344038,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.9022,
+      "step": 23
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.45425405167930893,
+      "learning_rate": 0.00014067366430758004,
+      "loss": 0.9084,
+      "step": 24
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3888790701288202,
+      "learning_rate": 0.00013583679495453,
+      "loss": 0.8691,
+      "step": 25
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3859572309538235,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.9427,
+      "step": 26
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3750342958666849,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.8988,
+      "step": 27
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.42707945856135066,
+      "learning_rate": 0.00012079116908177593,
+      "loss": 0.8242,
+      "step": 28
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3569583535912753,
+      "learning_rate": 0.0001156434465040231,
+      "loss": 0.9102,
+      "step": 29
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3975920783643425,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 0.9073,
+      "step": 30
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4446733801857913,
+      "learning_rate": 0.0001052335956242944,
+      "loss": 0.8722,
+      "step": 31
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.36800471454131656,
+      "learning_rate": 0.0001,
+      "loss": 0.8467,
+      "step": 32
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3721984511543991,
+      "learning_rate": 9.476640437570562e-05,
+      "loss": 0.9088,
+      "step": 33
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.38777813205326916,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.9306,
+      "step": 34
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.35423324887463065,
+      "learning_rate": 8.435655349597689e-05,
+      "loss": 0.8836,
+      "step": 35
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.36569409428500577,
+      "learning_rate": 7.920883091822408e-05,
+      "loss": 0.8736,
+      "step": 36
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4578563744683934,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.8879,
+      "step": 37
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3501501062668216,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.7837,
+      "step": 38
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.33086131280733955,
+      "learning_rate": 6.416320504546997e-05,
+      "loss": 0.8213,
+      "step": 39
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.45835208970698804,
+      "learning_rate": 5.9326335692419995e-05,
+      "loss": 0.9113,
+      "step": 40
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.36550765994451895,
+      "learning_rate": 5.4600950026045326e-05,
+      "loss": 0.8105,
+      "step": 41
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3361874225881034,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.8611,
+      "step": 42
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3722661952123568,
+      "learning_rate": 4.5536096498497295e-05,
+      "loss": 0.8403,
+      "step": 43
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4210787324219839,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.857,
+      "step": 44
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4282075988975985,
+      "learning_rate": 3.7067960895016275e-05,
+      "loss": 0.9317,
+      "step": 45
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.43157948987013084,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 0.9615,
+      "step": 46
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.38491521771794995,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.833,
+      "step": 47
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3171022379562048,
+      "learning_rate": 2.5685517452260567e-05,
+      "loss": 0.8299,
+      "step": 48
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4273540932961301,
+      "learning_rate": 2.2285403854302912e-05,
+      "loss": 0.8365,
+      "step": 49
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3909218340959966,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.9118,
+      "step": 50
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.44627480284108617,
+      "learning_rate": 1.6132943205457606e-05,
+      "loss": 0.9798,
+      "step": 51
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.37564635907441385,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.8687,
+      "step": 52
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3964836458446199,
+      "learning_rate": 1.0899347581163221e-05,
+      "loss": 0.8678,
+      "step": 53
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3335011216773086,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 0.8725,
+      "step": 54
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.353338514931596,
+      "learning_rate": 6.6419573502798374e-06,
+      "loss": 0.8435,
+      "step": 55
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.40299785540917815,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.8651,
+      "step": 56
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3805449892675675,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.8376,
+      "step": 57
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.354414880275706,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 0.822,
+      "step": 58
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.40618551125903773,
+      "learning_rate": 1.231165940486234e-06,
+      "loss": 0.8489,
+      "step": 59
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.340197098413167,
+      "learning_rate": 5.478104631726711e-07,
+      "loss": 0.8971,
+      "step": 60
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.30969710317886306,
+      "learning_rate": 1.3704652454261668e-07,
+      "loss": 0.8196,
+      "step": 61
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.41364963711720143,
+      "learning_rate": 0.0,
+      "loss": 0.9221,
+      "step": 62
+    },
+    {
+      "epoch": 0.992,
+      "step": 62,
+      "total_flos": 142969741377536.0,
+      "train_loss": 0.9199699201891499,
+      "train_runtime": 1832.8279,
+      "train_samples_per_second": 1.091,
+      "train_steps_per_second": 0.034
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 62,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 142969741377536.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b400cc756ab92d6dd0bbdd0ee89424d3f3eda6bd
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "o_proj",
+    "q_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8bc85224d0cd7e560b907506e636b2b16b5e3b17
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca558bafced7c7b27a9208da4ee3181a6b8d61c75e762ec47d008fe6fbd2140c
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a78cbc4c43ab4be89716e7aa80974519654f631b
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc284ccd6a5a914da3aaf3a30045df73f4add33df7f51fb65b6511edb9adabc5
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..37cbc589e91f12d02d4552c4f67eeb5e4d9d585f
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.0256564435424111,
+      "learning_rate": 5e-05,
+      "loss": 1.4655,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7552355489285593,
+      "learning_rate": 0.0001,
+      "loss": 1.2215,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6883786739819368,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1932,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7970757164530996,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.900673855041327,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.1636,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.8148780701606347,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.0844,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.677259070559955,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.9484,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5546823400726815,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.981,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.8640394420809945,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 1.0034,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7573926449411329,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 1.1,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5785453292954227,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.8881,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.6166851826434284,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.9721,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5915963542639949,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.9505,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5694992463916143,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.9008,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5813339466623496,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.9677,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.631903506762042,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 1.0503,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.49686139134072715,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.8793,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5712952636157388,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.9202,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.47221550019952907,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.9442,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.6481338782372316,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.9982,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.49626606548379415,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.8958,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5263065418122417,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.9311,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5978993780520426,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8855,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.48422608170661724,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.8992,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5583000466790079,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.9376,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.571615352349519,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.9632,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4227469990522799,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.7566,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4975002354693412,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.8079,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5206883816667652,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.9136,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.6640775128452036,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 1.002,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.7499661444791574,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 1.0519,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5344465855257942,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.8921,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.5469745110497275,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.8649,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.7278110424470223,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 1.0113,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5756491784352804,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.9774,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4363257043622499,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.8408,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.6394076037709749,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.9368,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5240474084698316,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.9945,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.47424846415648575,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.9055,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6321195449640729,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.9368,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.46626876205754336,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.8828,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4697338340572203,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.8327,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.5418381396777817,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.8599,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5393492791835172,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.9383,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4327656946256297,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.8041,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.5974899791865946,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.9177,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.6401617960325086,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.9027,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5481131970503021,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.8235,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.6073544798834368,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.9103,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.565209541323449,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.7908,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5448502238275968,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.8594,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5523586951983569,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.9575,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5250307199317558,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.9733,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.5532536577531625,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.9504,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5052948111209579,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.8633,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4805987502443205,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.8458,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.46115274580823334,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.899,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.5760045108097972,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 1.0322,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.5570620890429253,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.9192,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3867470289209327,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.7405,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5035270531240534,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.8517,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.7185476011366976,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 1.0731,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5910265699485117,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.853,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5851423007341354,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.8727,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.7056454049413521,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.9447,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4891741063331434,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.8942,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5732278436397883,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.8911,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.485622139957875,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.8543,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.6907666024932004,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.8492,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.33329903514610504,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.6644,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.4918765786267516,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.8767,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.46601601831477785,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.8073,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.6301470263304556,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.9524,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4644794063278805,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.769,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4642745345431068,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.8602,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.5431808435710237,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.9022,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4667366354704192,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.8729,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4262573768737331,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.856,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4153950446060949,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.7869,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.7707225868989106,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 1.131,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4561390581971247,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7539,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4794829311105941,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.786,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4895576383157025,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.8944,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4273926670520935,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.8019,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.556449141244985,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.8711,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.47015114999902236,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.8838,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.5704985671822204,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.8481,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4365452710110725,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.8386,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.391305650590876,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.8043,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.6824405839379895,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.9423,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.660416355913274,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.8834,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.6227410059725017,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.9125,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4583331907499313,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.8201,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5392615246175726,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.7673,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4023904332506835,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.6999,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.48547058856396635,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.885,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.4544749425094669,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.792,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.48299342465590717,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7639,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.39134288979882637,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.7161,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5137604085244599,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.8608,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.47536392660264287,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.7838,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.564710086900722,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.9152,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.4969437135139688,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.8222,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.49254817501894227,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.9312,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.48952543980152713,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.8452,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4424646360215586,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.8221,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4175512352633237,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.7827,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5831510931897785,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.9001,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.5059003584438967,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.8749,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4736929126239816,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.7588,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.430356609626266,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.7268,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.46958716530124167,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.8987,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4162473319543384,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.8112,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.5341610230495429,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.9766,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.5720919227278933,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.8654,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4560340817159988,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.776,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.6335819308236421,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.8167,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.43926428679607193,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.8853,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4925124228161911,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.883,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.41754865671962843,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.8502,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.47875102943981157,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.9114,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3997925650898424,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7552,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.6125224289311584,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.9852,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4513526909585683,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.8328,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5018800693252005,
+      "learning_rate": 0.0,
+      "loss": 0.7772,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 102017789034496.0,
+      "train_loss": 0.8980674777030945,
+      "train_runtime": 1842.54,
+      "train_samples_per_second": 1.085,
+      "train_steps_per_second": 0.068
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 102017789034496.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00f124d2409ee80e0a03741e688c81d3af18b475
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9832d429f53322da94e68110a0081c2e8aff015b
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26036b497c88ae220df32bcfed67582fd83decba854b1fbb7306af08bba9cc9a
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f53428b1fc659506742fbd14baa7edaea389476e
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a55781e86d42bff8bcbd7ff9f97ecbb1b2e38408ccabd43d24ee3686edaad85
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a4101fcf05ffb025b481e90ecab1245ff5a0b891
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,476 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.782165330544756,
+      "learning_rate": 0.0001,
+      "loss": 1.3435,
+      "step": 1
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.754070307459071,
+      "learning_rate": 0.0002,
+      "loss": 1.3293,
+      "step": 2
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5461919719204326,
+      "learning_rate": 0.0001998629534754574,
+      "loss": 1.1966,
+      "step": 3
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.9835561940313817,
+      "learning_rate": 0.00019945218953682734,
+      "loss": 1.114,
+      "step": 4
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.9635137567561632,
+      "learning_rate": 0.00019876883405951377,
+      "loss": 1.129,
+      "step": 5
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.45232540275106836,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 0.9551,
+      "step": 6
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.46885643899795243,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.9536,
+      "step": 7
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.47240411115859143,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 1.0253,
+      "step": 8
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.469293314652227,
+      "learning_rate": 0.00019335804264972018,
+      "loss": 0.9232,
+      "step": 9
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5104546446757983,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.985,
+      "step": 10
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4601532263027663,
+      "learning_rate": 0.0001891006524188368,
+      "loss": 0.9291,
+      "step": 11
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4486806246135983,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.9031,
+      "step": 12
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4571343318182727,
+      "learning_rate": 0.00018386705679454242,
+      "loss": 0.9608,
+      "step": 13
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.36898115931198266,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.7977,
+      "step": 14
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.45407015287167707,
+      "learning_rate": 0.0001777145961456971,
+      "loss": 0.9692,
+      "step": 15
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4624428629659114,
+      "learning_rate": 0.00017431448254773944,
+      "loss": 0.9795,
+      "step": 16
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.43933391342487677,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.9406,
+      "step": 17
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.38659300160601445,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 0.9119,
+      "step": 18
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.44012941016542223,
+      "learning_rate": 0.00016293203910498376,
+      "loss": 0.9715,
+      "step": 19
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.41558944105943874,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.9331,
+      "step": 20
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.37477800540519907,
+      "learning_rate": 0.00015446390350150273,
+      "loss": 0.8592,
+      "step": 21
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3865244154226343,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.9036,
+      "step": 22
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3723977046786962,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.8628,
+      "step": 23
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5174938737735406,
+      "learning_rate": 0.00014067366430758004,
+      "loss": 0.8709,
+      "step": 24
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4309191480685511,
+      "learning_rate": 0.00013583679495453,
+      "loss": 0.8449,
+      "step": 25
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3858404886322895,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.9132,
+      "step": 26
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3947316248290328,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.9646,
+      "step": 27
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3732869809484617,
+      "learning_rate": 0.00012079116908177593,
+      "loss": 0.8615,
+      "step": 28
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.40598163636604373,
+      "learning_rate": 0.0001156434465040231,
+      "loss": 0.9674,
+      "step": 29
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3805757812339103,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 0.8304,
+      "step": 30
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.43863699893987973,
+      "learning_rate": 0.0001052335956242944,
+      "loss": 0.9668,
+      "step": 31
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4098167304825637,
+      "learning_rate": 0.0001,
+      "loss": 0.8626,
+      "step": 32
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4571546550706454,
+      "learning_rate": 9.476640437570562e-05,
+      "loss": 0.9252,
+      "step": 33
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4084843884323696,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.8752,
+      "step": 34
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3636969358131221,
+      "learning_rate": 8.435655349597689e-05,
+      "loss": 0.7612,
+      "step": 35
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.35072568532082526,
+      "learning_rate": 7.920883091822408e-05,
+      "loss": 0.847,
+      "step": 36
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4076963271445678,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.8668,
+      "step": 37
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3794362951488034,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.887,
+      "step": 38
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.35306265395704695,
+      "learning_rate": 6.416320504546997e-05,
+      "loss": 0.8711,
+      "step": 39
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4539374345769442,
+      "learning_rate": 5.9326335692419995e-05,
+      "loss": 0.9706,
+      "step": 40
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3460724552816038,
+      "learning_rate": 5.4600950026045326e-05,
+      "loss": 0.7806,
+      "step": 41
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.33225221207720107,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.8534,
+      "step": 42
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3773553326234811,
+      "learning_rate": 4.5536096498497295e-05,
+      "loss": 0.8824,
+      "step": 43
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3721320667635631,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.8505,
+      "step": 44
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3952460893667804,
+      "learning_rate": 3.7067960895016275e-05,
+      "loss": 0.8821,
+      "step": 45
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4583596405040786,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 0.9099,
+      "step": 46
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.34318822344159067,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.8046,
+      "step": 47
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3974864621077287,
+      "learning_rate": 2.5685517452260567e-05,
+      "loss": 0.8029,
+      "step": 48
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.33691562279115866,
+      "learning_rate": 2.2285403854302912e-05,
+      "loss": 0.7858,
+      "step": 49
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3468959629694207,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.7976,
+      "step": 50
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4023248117346016,
+      "learning_rate": 1.6132943205457606e-05,
+      "loss": 0.8573,
+      "step": 51
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3788853906081972,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.8905,
+      "step": 52
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.33720185569187455,
+      "learning_rate": 1.0899347581163221e-05,
+      "loss": 0.8469,
+      "step": 53
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3739374025884539,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 0.8505,
+      "step": 54
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3505596171112871,
+      "learning_rate": 6.6419573502798374e-06,
+      "loss": 0.8288,
+      "step": 55
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3417938856878257,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.822,
+      "step": 56
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3688386882858889,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.9049,
+      "step": 57
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3665705430510355,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 0.8364,
+      "step": 58
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.37903546324196835,
+      "learning_rate": 1.231165940486234e-06,
+      "loss": 0.864,
+      "step": 59
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.33909260997680446,
+      "learning_rate": 5.478104631726711e-07,
+      "loss": 0.8762,
+      "step": 60
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3213467139172359,
+      "learning_rate": 1.3704652454261668e-07,
+      "loss": 0.8468,
+      "step": 61
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3981209076516244,
+      "learning_rate": 0.0,
+      "loss": 0.9223,
+      "step": 62
+    },
+    {
+      "epoch": 0.992,
+      "step": 62,
+      "total_flos": 146045366304768.0,
+      "train_loss": 0.9138678091187631,
+      "train_runtime": 1818.899,
+      "train_samples_per_second": 1.1,
+      "train_steps_per_second": 0.034
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 62,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 146045366304768.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..505d4ac1d30fa33adf5f438cc8f88c29ee45a54c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj",
+    "down_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..dfabb4564d1142a817d39e880f869095d8b7fa46
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30a3447fe5b036a4f8feb09dcd886357112019ee0b7e6259eb0b48aeae510bfe
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fdb0860c5097fc8eda2d1915d07515d5c4e36d7f
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4087778e60c19a0c01403e8aea7fb0ce9d0d3525e435d56f9ea21771a9c2aaa5
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c936d5f48a57435cebbfb7dcbc1584e2d29abfdb
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.0332716441410472,
+      "learning_rate": 5e-05,
+      "loss": 1.453,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8514892777847967,
+      "learning_rate": 0.0001,
+      "loss": 1.3234,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6959198723787812,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.2067,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.8987235917741394,
+      "learning_rate": 0.0002,
+      "loss": 1.1664,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.1299456614468177,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.1911,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.793226445782373,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.1325,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.9253618295353283,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 1.0616,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5422074768572798,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.9376,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.7037637052231442,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 1.1079,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7361037631831095,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 1.0008,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5450700968763571,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.8945,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.6663281248179639,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 1.0249,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5780218413581901,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.9906,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5939027347752556,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.8611,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5385064078264016,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.9205,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5649023377388397,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.9865,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4899049124834227,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.9336,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5134066015087574,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.8971,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.49291923410925337,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.865,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5925651438966584,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.9769,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5320287155420453,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.9529,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5644447042868125,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.9704,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.614822477245167,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.9425,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.49909622212960736,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.9296,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5414158089755281,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 1.0019,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.6410161194084684,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.9462,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5017741156898022,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.7758,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5629101098935121,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.9592,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.7337176164220769,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 1.0029,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.6164075654413312,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.9299,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.7419949695157417,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 1.0221,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5299112205217898,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.9308,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.5776412989569846,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.9942,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.6313052845956789,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 1.0971,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5003061050107913,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8396,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4591443010727368,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.7597,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5911488491446177,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.8529,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5989227409817768,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.9804,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.563063337709486,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.9114,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.730771230167327,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.9354,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.4639652152528779,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.8546,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.46037957857450756,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.8659,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.5473058205736925,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.8318,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.46545563704205734,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.9029,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.46677463624164733,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.8779,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.6437448823503382,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.9853,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5912726223579762,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.9654,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5474299182817363,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.8757,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.6580789970255577,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.8612,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5334122921747871,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.8773,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.6562301905218774,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.9756,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5101667749739541,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.928,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5565479940344975,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 1.0204,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.5638515277819136,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.9225,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.43460188063783134,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.7718,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4554073002029318,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.8008,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.5208371009471047,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.8889,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.5593229063063273,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.9384,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.54960098782361,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.9036,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.48884855977815644,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.8171,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.46435529530768666,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.8391,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.6628142660135761,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.8476,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.6122798204602851,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.88,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.49046332010661803,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.8288,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6136093202470114,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 1.0322,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.5382687359600139,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.9218,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5377382533156001,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 1.024,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5201987253132132,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.9424,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.7127669438364177,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.8807,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.37575821411555355,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.7847,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.49383393324752206,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.8894,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.523020599533793,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.8799,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.5727461338955292,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.9956,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5591412806115048,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.8745,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4893113572410945,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.8439,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4501928716993276,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.8346,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4411794345564976,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.7823,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4815928556011765,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.9199,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.45213927151139305,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.7929,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.8202385268416413,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 1.1149,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.43244185742587343,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7356,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4645090449706881,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.8437,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.43524775119682985,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.8441,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4929646970860245,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.8442,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5681369158524896,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.939,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.49771416915403976,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.8819,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.6441986856235902,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.9392,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.41313256987360597,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.7875,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4259263131590564,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.808,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.6227537569017948,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 1.0245,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.588509982783627,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.7623,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5622760139848599,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.9896,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.48763232574296367,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.8559,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4667304015024215,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.8267,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4261148874220418,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.8342,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4603737051292125,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.8217,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.4633578614777036,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.8426,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4740449779874253,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7619,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.5256639252782974,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.9056,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5471215610899327,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.9285,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.535860622346642,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.8971,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.6307994094530405,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 1.0222,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5641556223888017,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.9874,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.597451406822819,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.8467,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.47083496691750065,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.8349,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5075295600997802,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.9282,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.5065339725265197,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.7942,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5116839630404477,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.8678,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.5642318524566664,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.8711,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5469422927536324,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.8471,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.42867885430913916,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.7824,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4815338643224403,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.8101,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4968549860424203,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.7646,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4976065822884428,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.8364,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.5250560990287962,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.8047,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4690379102326673,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.8484,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.5793237985664929,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.9048,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4269403095096772,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.7737,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.46070532681301707,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.774,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.46255760757400677,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.9062,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4638669355270497,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.8366,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.39893326777206606,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7951,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.6063856963402054,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.8833,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.48929628653621665,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.894,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.5869716432080327,
+      "learning_rate": 0.0,
+      "loss": 0.9297,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 101132938903552.0,
+      "train_loss": 0.9124656667709351,
+      "train_runtime": 1847.9032,
+      "train_samples_per_second": 1.082,
+      "train_steps_per_second": 0.068
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 101132938903552.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d454d3aa3d035dd52481edda13024c9006e2ae5e
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "up_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..689a602f2a626b38c8fd9f3480f1f0369532ccc8
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1193313d43dcfadaebfbe1f934c944b44e81d1cc55efe9e59b9caba49b9348dd
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d85cb3c0d0899ced2143e96dd10db65277bb979e
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b963a4ff177f9c1ffb786e646e68709ebae25e6b73263d2b0fb7f24eb75bdaba
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3bbb77963f32d428ddecf087dc5b022f97043555
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,476 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8115376946117644,
+      "learning_rate": 0.0001,
+      "loss": 1.3882,
+      "step": 1
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7379596036373107,
+      "learning_rate": 0.0002,
+      "loss": 1.2875,
+      "step": 2
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6516716706764099,
+      "learning_rate": 0.0001998629534754574,
+      "loss": 1.2321,
+      "step": 3
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.8045523995759459,
+      "learning_rate": 0.00019945218953682734,
+      "loss": 1.1171,
+      "step": 4
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.9748341310102832,
+      "learning_rate": 0.00019876883405951377,
+      "loss": 1.1203,
+      "step": 5
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.43262498003397853,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 0.9855,
+      "step": 6
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.43710555081674196,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.9553,
+      "step": 7
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4299584668831962,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.9737,
+      "step": 8
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.40784014191864315,
+      "learning_rate": 0.00019335804264972018,
+      "loss": 0.9345,
+      "step": 9
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4890797924633881,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.942,
+      "step": 10
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.47290779258642107,
+      "learning_rate": 0.0001891006524188368,
+      "loss": 0.9837,
+      "step": 11
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.45166657752550265,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.9566,
+      "step": 12
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4802427221669656,
+      "learning_rate": 0.00018386705679454242,
+      "loss": 0.998,
+      "step": 13
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.39526045068707355,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.8799,
+      "step": 14
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4410743595660131,
+      "learning_rate": 0.0001777145961456971,
+      "loss": 0.9714,
+      "step": 15
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.46620032478721474,
+      "learning_rate": 0.00017431448254773944,
+      "loss": 0.9765,
+      "step": 16
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.47112777607913847,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 1.0516,
+      "step": 17
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.34959905040630146,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 0.8071,
+      "step": 18
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.42026296264872304,
+      "learning_rate": 0.00016293203910498376,
+      "loss": 0.9082,
+      "step": 19
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.46200194126278427,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.9288,
+      "step": 20
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.34456840268198746,
+      "learning_rate": 0.00015446390350150273,
+      "loss": 0.858,
+      "step": 21
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3515579855408626,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8691,
+      "step": 22
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.38030364310331255,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.9363,
+      "step": 23
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.42311870140610947,
+      "learning_rate": 0.00014067366430758004,
+      "loss": 0.9232,
+      "step": 24
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.39866994027687425,
+      "learning_rate": 0.00013583679495453,
+      "loss": 0.8756,
+      "step": 25
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.41257440907704074,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.956,
+      "step": 26
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.40270214195226256,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.9732,
+      "step": 27
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3252586417943798,
+      "learning_rate": 0.00012079116908177593,
+      "loss": 0.7926,
+      "step": 28
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.40219994775926615,
+      "learning_rate": 0.0001156434465040231,
+      "loss": 0.9141,
+      "step": 29
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4617984370404977,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 0.8637,
+      "step": 30
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4517632713189801,
+      "learning_rate": 0.0001052335956242944,
+      "loss": 0.84,
+      "step": 31
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.39591319078574316,
+      "learning_rate": 0.0001,
+      "loss": 0.8572,
+      "step": 32
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.43145688796857856,
+      "learning_rate": 9.476640437570562e-05,
+      "loss": 0.9818,
+      "step": 33
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.48356961743265936,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.9811,
+      "step": 34
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4235867054801886,
+      "learning_rate": 8.435655349597689e-05,
+      "loss": 0.8341,
+      "step": 35
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3838193770356642,
+      "learning_rate": 7.920883091822408e-05,
+      "loss": 0.8893,
+      "step": 36
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4037097044310178,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.9402,
+      "step": 37
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.35482227410871914,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.8438,
+      "step": 38
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.33458547670959093,
+      "learning_rate": 6.416320504546997e-05,
+      "loss": 0.8558,
+      "step": 39
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.48202841998230445,
+      "learning_rate": 5.9326335692419995e-05,
+      "loss": 0.9616,
+      "step": 40
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.33135359355210775,
+      "learning_rate": 5.4600950026045326e-05,
+      "loss": 0.7899,
+      "step": 41
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3655941362699932,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.8469,
+      "step": 42
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4416572733780382,
+      "learning_rate": 4.5536096498497295e-05,
+      "loss": 0.9167,
+      "step": 43
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3963306861215452,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.8726,
+      "step": 44
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.403752781263034,
+      "learning_rate": 3.7067960895016275e-05,
+      "loss": 0.9247,
+      "step": 45
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4429767955337501,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 0.8834,
+      "step": 46
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3854408763441377,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.8498,
+      "step": 47
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.32318832922926677,
+      "learning_rate": 2.5685517452260567e-05,
+      "loss": 0.8347,
+      "step": 48
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.35118024180739127,
+      "learning_rate": 2.2285403854302912e-05,
+      "loss": 0.8105,
+      "step": 49
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4052987291396471,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.9288,
+      "step": 50
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.44147568315067864,
+      "learning_rate": 1.6132943205457606e-05,
+      "loss": 0.9733,
+      "step": 51
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.39868663009902816,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.9249,
+      "step": 52
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3512200707436714,
+      "learning_rate": 1.0899347581163221e-05,
+      "loss": 0.8878,
+      "step": 53
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.36779032235091047,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 0.8425,
+      "step": 54
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4039621504635964,
+      "learning_rate": 6.6419573502798374e-06,
+      "loss": 0.8691,
+      "step": 55
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.6982928092645577,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.8083,
+      "step": 56
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3586377109842292,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.8086,
+      "step": 57
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3752504895539116,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 0.8385,
+      "step": 58
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3658655707367418,
+      "learning_rate": 1.231165940486234e-06,
+      "loss": 0.847,
+      "step": 59
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3096899200162482,
+      "learning_rate": 5.478104631726711e-07,
+      "loss": 0.8499,
+      "step": 60
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3203032261580068,
+      "learning_rate": 1.3704652454261668e-07,
+      "loss": 0.8257,
+      "step": 61
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.40910150612194435,
+      "learning_rate": 0.0,
+      "loss": 0.9072,
+      "step": 62
+    },
+    {
+      "epoch": 0.992,
+      "step": 62,
+      "total_flos": 147227655536640.0,
+      "train_loss": 0.9255667328834534,
+      "train_runtime": 1838.2528,
+      "train_samples_per_second": 1.088,
+      "train_steps_per_second": 0.034
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 62,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 147227655536640.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..54aea61e623644bcee43f27e87281877e749b36a
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1f5e3ef0031d3e669ba9381578250bc72c67aaed
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7459044d10fe89f47d2625e747c6f65abe2e2949be54a8120cbebc9fa49f5b99
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..077c8ef7f96e606115c6a8bb9c09fc18fce0ec9e
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69001178fa6f4f8a27fe76d6457aff758561b0096c67de38dbde021927b77345
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1ab08c470f54e229f81d15dd86f84324317d096
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.7478160203624254,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.2281,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.0422662241870386,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.1893,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.898821327534039,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.2598,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 0.9404599215806203,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.3365,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.8271399266416969,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.4094,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7523511046413769,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.1267,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.8181154938413164,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.2057,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.6888887133244778,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.2002,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.6372000597807975,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.1094,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.8311538549618501,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.2097,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.7787950909551065,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 1.0411,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8628953898774414,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1819,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.7207552828246774,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 1.0764,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.8317473561875744,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 1.1077,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.0880818053614876,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.2038,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.7101685723179632,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 1.038,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.7977818474566017,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 1.0586,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6964640262892129,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.017,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.6772461845492532,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.9125,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.8132239433117805,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.9926,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.7794313373761221,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1181,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.5293596087693633,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.8762,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.7665557573044675,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.8919,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6499984197559702,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9798,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.5797362185097485,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 0.9479,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.6285858322846415,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.7771,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.685266749813973,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9645,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.6392698438243327,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.9325,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.663102762162956,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.9508,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5064758958958117,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8695,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.5992180434134455,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.9056,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.5062578401046487,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.8436,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.7926225185864879,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9324,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.7470260490190631,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 1.0706,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.5382330100027091,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.8687,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6553871123688179,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8886,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.7528426065938322,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.9648,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.5463385508806565,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8942,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.6764033329582932,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9313,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.45754262448789773,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.8133,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.5362275391995662,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.885,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.6570143061405205,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9815,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.680711661682576,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.9644,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.6324241814563605,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.9316,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5298714490811897,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8209,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.5873277960396065,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.8341,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.7213114786508069,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 1.0156,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6916436592160714,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9357,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.7716424015946091,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 1.006,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.44992446044036816,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.7826,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.7427382532262721,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 1.041,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.7401724097633609,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 0.9546,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.5319230954044463,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.8394,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5148420974205532,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8594,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.6043264083005266,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.8999,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.4758608899930653,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.8195,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.5043119387684599,
+      "learning_rate": 0.0002,
+      "loss": 0.828,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.6030587988789778,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.9206,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.49317157797189015,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.859,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4964622434425981,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8302,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.6122813129682467,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.979,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.6301970834502888,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.9665,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.6251235739078242,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8917,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.5546239147411945,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.8638,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.6237721194072088,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.9551,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5219863685079097,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8254,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.5840916216178993,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.9045,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.5337180935661104,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.7799,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.8851875594074589,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.9175,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.6718075048302122,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.8535,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.5459572976567617,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.8797,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.6945402602726587,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 1.0184,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.5387553405804502,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.9378,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.5008798819123914,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.8913,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6145477921081341,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9477,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.547190001995045,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.8734,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.6009466383707704,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8684,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5839831615615629,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.953,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.584207906860248,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.9451,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.7531770577811897,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.9219,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5091828156406875,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.7954,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.6752524298420287,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.9881,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.7131387010883815,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 1.0035,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6297246197220789,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8603,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.5547873349046207,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.8457,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.5011787007865315,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.8908,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.4792485673023386,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8115,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.47356663334432353,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.838,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.4668409126247973,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.8978,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5711282533560464,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8763,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.5850757709665906,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.7986,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.6035424102863467,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.915,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.7741143540791687,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.967,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.7394279561513039,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.9477,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.5187414772692578,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.8944,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5040103640649286,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8896,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.615423569645884,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 1.0496,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.5060767129482728,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.859,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.5607251767336862,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8638,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.5833206272945435,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.9576,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.6127131832061536,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.9095,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.6063282263672138,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.881,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.5593957155103715,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.8878,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.5384031206814662,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.8585,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4974118502751297,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8453,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.6566801387881922,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.8566,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.6465561263035702,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.9044,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.6948653356023212,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.9912,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.6676689265229045,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 1.052,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.4810496727394801,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.7299,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.690134541637704,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.9363,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.6199958083338022,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.7922,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.7080924436502963,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 1.0961,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.646788939725199,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8522,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.4989726146441654,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.8339,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.6511852930206385,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.8986,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.6161598892774144,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.9124,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.5033677421045472,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.8814,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.5746517616929623,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.9185,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.6343958142028976,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.9325,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.5994397750129418,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.8876,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.5107450721069634,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.8342,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.7435486373478364,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 1.0186,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.809182825238528,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 1.0035,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.5819253848159949,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.8812,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.6508723013254191,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.9668,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.5796337333621042,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.9341,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.5420371988390913,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.8384,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.6320018365199501,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8598,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.5647761224706593,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.8693,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.5501288243476312,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.8387,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5178460421740548,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.914,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.5346632219276891,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.8322,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.6523935549795017,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.8774,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5679617364387883,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8505,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.49663539815551794,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.8177,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.5217944742403843,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.8152,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.5537561506732283,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8768,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.6064073082702489,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.8915,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.5488497227462218,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.9196,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.6528111270053313,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8861,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.5279519427408583,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.87,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.6185666663340771,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.8429,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.6171065838197844,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.8895,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.49653402021215537,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.8521,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.4832143821851556,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.8467,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.5930464864818148,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8942,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.6406968641975115,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.9863,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.5380431693069204,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 1.0119,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.47697223182531917,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8244,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.6027764246637539,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.9495,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.4223573475293651,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.7734,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.6816218459869196,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 1.0553,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.5288915810363807,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.8751,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.4947913647208401,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.804,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.6904189838107081,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.9027,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.5992964636587961,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.8853,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.5348320967845114,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.8075,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.6298550790096009,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.9349,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 1.9331938077792006,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.8624,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.5236801578526996,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.8768,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4574937323292251,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8148,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.6136683690125613,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.9091,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.4514678873411745,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.7938,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4402770181429307,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7222,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.548887864087565,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.8248,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.5562661508499465,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.9384,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.6033629683305896,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.9273,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.8342562362215857,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 1.1554,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.5069447856597911,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.9092,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5734351306353567,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8657,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.48674893805867275,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.8155,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.5894232151359743,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.9222,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.6063045680692516,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.9494,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.5938132305924131,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.9302,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.5913801841305889,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.8658,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.9822282297619556,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 1.0164,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.6757553741126089,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 0.9826,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 1.233905974997222,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8634,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.7286142841214036,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.9223,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.45484263786353385,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.7534,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.5711816184206053,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.8735,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.5418657935134991,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.9992,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.6185597952660713,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.8937,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.516031279053156,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.9806,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5599066265752062,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8679,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.5630464137659468,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.8109,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.4814995495211024,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.7618,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.4970701269923393,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7755,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.6707479462374684,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.9362,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.645234890474023,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.9033,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5395230204710006,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7949,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.550382701125016,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.8044,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.5418835757233681,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.9103,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5149849534792273,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8234,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.6222034792808916,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.9314,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.5020387021600372,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.8834,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.44609067118781853,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7588,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.59475354515924,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.8931,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.6103536228444102,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.967,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.4276077516316479,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7639,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.42679574249455227,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.7505,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.5195748925315388,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.892,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5430095858621583,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.8572,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.5490698318145699,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.7836,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.5810676797287423,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.9217,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5397056603391304,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8318,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.6852953334227522,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.987,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.5397841521812882,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.8567,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5558183153152181,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.889,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.5323335553062883,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8292,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.6359392350933629,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.8789,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.5157272312694805,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8382,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.4955543768095666,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.827,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.5185766880013812,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.8525,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5799216700682058,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8949,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.5746162775961909,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.9068,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.5942962281260388,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.9105,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.4822965949841608,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8006,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.5285129624324617,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.8916,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.578767555547315,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.8295,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5726253586422992,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7637,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.5803865400279506,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.847,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.6238889881153514,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.9309,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5693911379870041,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8328,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.6019961384633078,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.739,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.49021989674917477,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.8646,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5152822790823023,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8294,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.49830881392059656,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.849,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.6353438291466692,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.8289,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.5496078604729235,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.855,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.5328663723300848,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.8204,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.6044824618670902,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.9029,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.562340865698981,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8995,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.5694336641591617,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.8111,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.5493525419976079,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.8504,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.629488795640096,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.8496,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.4705893683305386,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.7404,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.514550204082626,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.8082,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6060127960470112,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8902,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.4152489103408303,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.8229,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.45642913586332107,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.7538,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.5557390666476755,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8656,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.5483083085505757,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.8621,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.6217623928148703,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.9082,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5544728011159186,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.9426,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.6386585001110127,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.8571,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.5364273119583183,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.8107,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4912213309565991,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8051,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.5891785869772029,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.915,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.48733214475456743,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.8577,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5799548544326986,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.825,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.4788858732810099,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.9298,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.5883545906221429,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.9316,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4904291459104027,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8268,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.7115946528102219,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 1.026,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.40575516082874447,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7482,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5403346585965954,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.9077,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.6662237887549459,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.9702,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.6942651389363136,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.9222,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.6579007152763529,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.9697,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.5496149330120147,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.9229,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.4490156799705011,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.7789,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4781576375838312,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8383,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.4795905566479139,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.8123,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.8181773682867574,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 1.0112,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.522706657600922,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.8527,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.5135054349379268,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.7414,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.5130302385142596,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.7821,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5547056633661651,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.9122,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.6441345254417032,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 1.0426,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.5273823414095974,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.9104,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.46069046608844544,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7569,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.5381318342085369,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.9187,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.7487514917933524,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 1.0041,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.47966687804112507,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8117,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.6273165405318113,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 1.0189,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.5670914286563735,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.8833,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.7232675325632234,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8313,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.5121007508125893,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.8204,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.5128296692744078,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.8674,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5204816772309242,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8232,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.5473040774478453,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.8777,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.6102177275322413,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 1.04,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5382713447028439,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.759,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.5637183014231405,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.7745,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.5005955166838629,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.8215,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5647214750330327,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8736,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.5430735915648874,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.8685,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.4985562198119292,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.8372,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.5410096511416426,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.9444,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.5400865975872126,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.828,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.5199681853342951,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.7628,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.6263122336192022,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.9499,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.5515913599293975,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.902,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.540077229662397,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.8651,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.5105953773814097,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8124,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.5885775121761744,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.8779,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.5468317477972652,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.9038,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4774898883273422,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7347,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.6546369687247534,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 1.074,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.5746219704096867,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.8785,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.512783055857806,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8484,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.48318252093811825,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.8693,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.4552038052381454,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.8363,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.48773217916485373,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.799,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.6014093204013539,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.8907,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.6155458246007832,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.9374,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.42801125846418625,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7845,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.4949972099285363,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.872,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.5035166804923811,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.9066,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.47353741325219184,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7517,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.5736697688223861,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.958,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.53247859954624,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.8478,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.6010734943757549,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8478,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.5121525604394572,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.7919,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.46694379864441826,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.7731,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.5485906739730657,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8151,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.4585234757479113,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7339,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.5940002138588427,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.8679,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.6057502149185376,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8719,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.5810819000531806,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.9198,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.6083936326137559,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.86,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.6865828858997022,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.9054,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.566563080612365,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.8687,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.5006268810917645,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.7906,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.5018590669905312,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8252,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.5965473876338885,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.986,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.4330352467359094,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.7868,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.6268359083673914,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.9152,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.580180599972275,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.885,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.4364646368352817,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.7072,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.5736322025691719,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7942,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.6768535971383288,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.9545,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.4679683938709085,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.7854,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4719121618259123,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7323,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.4632021599885029,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.7609,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.5804129077627767,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.9667,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.4169465814255522,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7773,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.6055416889203082,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.9501,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.47097022652856235,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.8522,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4711804717392996,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.729,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.5833928473869843,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.9746,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.43197057031279645,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.7755,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.6187166684792078,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.85,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.4964059310236803,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.7622,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.503061098176286,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.8098,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4621277025640317,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.786,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.5436191215591101,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.9056,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.48810332750586005,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.7994,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.473568239322871,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8444,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.5586622378076873,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.8522,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.4281119912363646,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.7804,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5715157096974114,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8718,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.5316707554900694,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.8543,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.5438502162254243,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.8143,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4961794864266288,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7625,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.5627823210958318,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.8573,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.8146216841274553,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.9443,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.6009169654207878,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.9283,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.41578777077511664,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.7357,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.666050224471447,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.8986,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.47970987588672004,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8287,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.479860868749402,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.8108,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.5884506522951436,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.9642,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5510794845241416,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8743,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.5795214491241171,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.8866,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.5344279213250357,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.75,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5005010601614996,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.838,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.6184079702216735,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.9063,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.5707216840794579,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.9228,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.45330242748343846,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7376,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.6088262432419314,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.9296,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.4531406961541151,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.7553,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5647055757625112,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8885,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.5882369341941862,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.8576,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.5400256164599937,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.8778,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.676537394010591,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.9723,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.5167315801994107,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.9023,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.47234634295211225,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.7748,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5752326204538213,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8052,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.48278342600159446,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.8469,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.5747001961815094,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.884,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.63448866592384,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.8585,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.4819688568653652,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.7211,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.5435821237712569,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.8053,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4670677897534823,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8869,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.5443572205211442,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.9102,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.5908846609114969,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.7811,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5765923750964485,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 1.0941,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.5138817115120994,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.7936,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.5995044651962633,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.8256,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5285618582126465,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8912,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.6023859216088946,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.7781,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.5007290710095313,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.7527,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5228675855644663,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8552,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.5496498789032858,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.8445,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.5220544188374293,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.8247,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.5067334422083659,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.8302,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.4691172286804912,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.7837,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.5330880084610178,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.8542,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4642122947986411,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7679,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.43891395287461854,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.7624,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.4237105239650998,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.7488,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5130046523239726,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7836,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.6317362602782209,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.8742,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.5580357630858312,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.738,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.5387490413701905,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.8758,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.5047434117071069,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.874,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.5941421898634975,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.8613,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.5574177582023201,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8667,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.5173676833274382,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.8572,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.4860014045365757,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.7569,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.6180223002740645,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8477,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.5609024155241303,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.8657,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.47057323212256735,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.7562,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4945005490935583,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8207,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.6133187947117877,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.8199,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.6911757752607458,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.87,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.6137929993298187,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.837,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.5501024759025723,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.7912,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.5063033271652045,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.7805,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.510137173717112,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7797,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.6157392638015244,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.9578,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.622176074552984,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.8868,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.6176511393183507,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7839,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.6022582377918664,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.9504,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.6117420394311651,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.8724,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4613306447590751,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7111,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.49274299642677505,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.9149,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.5144909057081801,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.9043,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4574806071235767,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.824,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.6321660311909871,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.9061,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.43513411301300586,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.7199,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.6248157856692813,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.8101,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.47038352398190103,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.7837,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.5856396926417063,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.8927,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4976973531467887,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.8334,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.4818085061614485,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.7827,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.47692623522400623,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.7826,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5466326203656854,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8243,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.5152162988773265,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7567,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.45388835284062296,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.7695,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5853618309402371,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.8208,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.6106932999990901,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.867,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.4874600419982771,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.7949,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4817403222328486,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7809,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.4685691151727727,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.7372,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.6095969024234741,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.7889,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.6301026106350284,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.9439,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.558599760329991,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.8016,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.5867696843362354,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.9291,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4542271311118376,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.831,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.48078456375330836,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.8061,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.5837470567030518,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.8194,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.590019028646285,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8989,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.5201869075062091,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.8389,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.5456899747750916,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.8175,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.5379656453512448,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8548,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.504279218630493,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.8189,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.5100218415175828,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.7222,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5356827307519676,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.9023,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.4764855230388838,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.8479,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.575984961535713,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.7998,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.5896885652530898,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8793,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.4369359609408987,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.7633,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.6604020258812308,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 1.0006,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.6622083940456467,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8981,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.6256728641179665,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.9839,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.6290910760399225,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.8761,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.5272851956074754,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8193,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.46195198232319007,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.7859,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.520073020384154,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.8488,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.6085112558608898,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.868,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.5207737793312851,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.8593,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.5767838902749421,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.7459,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.3947273347232596,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7597,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.5804785627383354,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.8237,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.4714559275309985,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7953,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.47567972325860264,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.8037,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.8071744068960827,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 1.1113,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.47927549431497496,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.7565,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.542472938443313,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.8184,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.7695927800463104,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.9872,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.6358222932380025,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.8552,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5226165286628119,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8495,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.5529408183593187,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.8307,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.5398514940998124,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.8691,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.499866112005662,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8597,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.4269871088798527,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.7545,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.5058226215124434,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.8474,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5749674216515905,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.8277,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.5020270362004025,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.8762,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.5750060859703835,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.854,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.544859109216088,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.8573,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.42039803428169764,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.7308,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.514802162289825,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.688,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5132339873084991,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7364,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.6706884030316487,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.8348,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.5781843065650732,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.8212,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.5275680410726457,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8587,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.6370838075439973,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.9092,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.5015890105789446,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.8558,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4962255759207231,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8169,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.4629324862616736,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.7627,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.4790462376610464,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.7806,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.4589193456933277,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8131,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.52371605378381,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.8487,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.6854660099008522,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 1.0226,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.58934966996955,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.9247,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.5402264313064337,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.8788,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.47957437403783126,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.8203,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.44709841455364296,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7414,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.5332362597845244,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.8195,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.5904914584094229,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.9734,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.5342974518753195,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8036,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.6386296051221013,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.8332,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.5264806057403153,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.8166,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.6546698223767454,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8983,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.5634840483102529,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.8522,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.4733268974825999,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.7678,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5655096983871343,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7449,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.47868263433954594,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.8203,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.5266609333720934,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.82,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.6157752040080987,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7982,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.559279468125684,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.7989,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.5360024942522439,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.7937,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5096612921083854,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7727,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.8784262818664934,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.9953,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.6994689219107383,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.9399,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.46459927394998424,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7893,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.5935554208834619,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.896,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.4270487738727027,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.7317,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4996296503595494,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8214,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.5982774004385281,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.9189,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.4706957694534351,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.8101,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.549213850147289,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8732,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.5857748532390733,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.7735,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.5086801122280594,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.7906,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5138457768924369,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8575,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.5113291229424737,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.8191,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.5520767593214518,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.8266,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.47561911014972424,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7163,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.5824368385480034,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.9457,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.5586463030276689,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.8331,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4638270088383452,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8051,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.5286869926090003,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.8007,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.4357724563674736,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.7317,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.37164510925752114,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7168,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.48288239001177796,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.801,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.5043367237233689,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.7983,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.5419864242698199,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.8818,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.43418777180724477,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.7404,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.5173208460101304,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.8224,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5504684985819356,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7496,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.42604835139136,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.8319,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.47856812166759943,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.7996,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.5218824605992826,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7575,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.7257298399405144,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.8362,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.5599836703147796,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.8223,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.5962035431992888,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7909,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.5824597748075987,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.8891,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.4907512680751975,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.7922,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.6603540202315609,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.9001,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.47200466542824876,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.7159,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.6394882361961985,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.8674,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5228457915772745,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.833,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.49909695192505343,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.8456,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.5084880904551904,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.7277,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4544296233646207,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8133,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.5851864836675709,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.8358,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.5763269464522621,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.8423,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4996570666145586,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7754,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.5984740080839147,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.9079,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.5767114718560038,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.8105,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4912622950915509,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.8452,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.7038046059310531,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.8286,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.7134046759703173,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 1.0264,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.49404490870992984,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7835,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.623471111912179,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.8711,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.4804767904049977,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.8387,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.46761157040793505,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7845,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.7806940162867643,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.992,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.4311619702018419,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.8103,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.4994566659105595,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.8189,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.5846419690789595,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.8395,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.5866380456067939,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.9201,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4853202162020479,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7618,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.7194557992744496,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.9393,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.5960008271004776,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.9787,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.6403500254416795,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.9428,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.4931262065041592,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.7831,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.6453050711874272,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.9162,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.6674266833511433,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.954,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.46995150980533834,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.7899,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.480919148513137,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.7898,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.48233714744879935,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.768,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.5314895272602383,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.8241,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.549652819798972,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.9037,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4619070622568058,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.8204,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.5432509501817164,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.7856,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.5875370509312653,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.8048,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4639712654536695,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7795,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.4580442349982068,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7516,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.4346865408481033,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.7591,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.494760556314188,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7833,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.528352204234871,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.8288,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.6492585774807074,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.9012,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.5240550560844716,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.8712,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.5488479871941387,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.8995,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.5192865550604814,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.8374,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.5529828711217839,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8902,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.4971428712265958,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.8137,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.5374827282262518,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.798,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.49224084092290105,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7794,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.48859078374389414,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.8465,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.58071595013088,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.8189,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5070498492857128,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.8166,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.45243401011492024,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.744,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.4548224071388063,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.7771,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.6218642511004636,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.9506,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.5871105962110491,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.8571,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.4502345296443168,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.6505,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.5426367653877069,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.8341,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.5270219482527734,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.8399,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.45051557886044796,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.7813,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.4529058004894344,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7491,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.4491210952944165,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.6935,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.47190486222078226,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.8135,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5297070644313344,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7777,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.4681988968090716,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.7446,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.43319075533756046,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.7399,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5434146752151499,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.8054,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.563507787244446,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.8432,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.4467567514677332,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.7761,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.7510957755652906,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.9545,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.5212395372222507,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.7712,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.5480976245264051,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.8605,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4891950203200895,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.8298,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.6464062265699964,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.9124,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.6372004048385579,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.8321,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.6949770307779344,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.8713,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.5385662142958166,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.7753,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.4790502869005614,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.6772,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.47222086088374976,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7664,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.5504109345744241,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.7948,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.555248741312122,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.8657,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.7073240775226617,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.9117,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.4173185557703972,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.738,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.5281107008645449,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.8222,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.45267513821679595,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7409,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.5664719511836385,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.8596,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.5214844665016414,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.7568,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 1.0152205592076065,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8269,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.4519594191860944,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.8106,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.6341795199762441,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.7695,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.4476920087048747,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8549,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.7878868782854666,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.9339,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.5918566999530537,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.8271,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5530858544705913,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.8463,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.509467346093617,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.8852,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.4810367462976718,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.775,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.555117207802824,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.9463,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.5475186506627697,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.8014,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.549780187963014,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.9234,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.6521370981065189,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.9162,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.4915534290163208,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.7963,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.5822674771779652,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.8058,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4050047986048172,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.6743,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.48441273420916153,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.8787,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.453001655086232,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.7646,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.6405269452110277,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.875,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.544293763208351,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.8141,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.5289104399576442,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.8627,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4858671702738135,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7289,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.46105750944172125,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.7136,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.443082210947998,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.7249,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.47418210192702337,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7509,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.45300590502898413,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.7828,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.5426220757134201,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.8303,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4561823438528258,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7711,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.5199329649530187,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.7742,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.5638152597453153,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.8875,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.5063226354788936,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.6814,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.5242501097120728,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.8227,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.5923674137806112,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.8095,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.5214849178482943,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8084,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.5990569040221414,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.7951,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.5168001235866095,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.8297,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.835047444638174,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7869,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.5823636977054464,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.8818,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.44554725452601684,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.728,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.5020753127466948,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7315,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.5704474888880798,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.7228,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.5384520328041145,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.9527,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.394233650703157,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7055,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.47824908759666046,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.7723,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.5235905145789982,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.843,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.5732810130133138,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.9212,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.4244310157039686,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.739,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.5124127796057848,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.761,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5486393317506724,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8438,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.6586646039222367,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.9941,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.5441001490299535,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.8729,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.54014081332669,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.8425,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.7196576958019154,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.8626,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.5374306315732337,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.796,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.45059029744056645,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.6929,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.682780798968389,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.835,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.5760745710423022,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.9098,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.6632918626070273,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.8505,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.5281403267195357,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.809,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.5641990601207183,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.8848,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.527565077400961,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7952,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.5914854708228958,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.9428,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.5545782427824447,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.7598,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.5165600165575125,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.8076,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.5524505528132692,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.81,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.4404724358357859,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.7321,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5464096844465652,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.863,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.596326184176317,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.9013,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.5583953745799929,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.825,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.4659146569910716,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.785,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.594532375525093,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.9042,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.5711569809211288,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.8858,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4619560652991561,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.695,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.4537332799771394,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.7853,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.4903248337452614,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.7266,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.5185749863376512,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.8424,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.5810223907787916,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.7551,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.5175629712730113,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.8969,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.508891684470168,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.8234,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.43832681951349495,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.8012,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.47721748897383354,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.6773,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.5518334509589206,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.8515,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.5932736192719247,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.8884,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.5016716850470819,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.8362,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4408539969017525,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.655,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.5532651593950209,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.7705,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.7610366060314936,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.8962,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.5013309116849305,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.8293,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.6000546878969186,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.9113,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.5347278003291565,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7952,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.45936417274121255,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7514,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.5498682094516937,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.8453,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.431134911625742,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.783,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.45639860323450016,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.6993,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.4421287312564141,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.7874,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.5869186668573526,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.8399,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4910762154838812,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.8655,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.4270454417647534,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.6779,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.5167903722991387,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.8096,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.39608345285964125,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6431,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.48275630343373616,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.7902,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.5767104270423933,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.81,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.49965591680653026,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7811,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.5519634374737002,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.7847,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.444654791469125,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.6873,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.552838304876708,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8698,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.6226914419710441,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.7971,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.4637695203653883,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.7128,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4407982506043246,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7387,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.49683090767362115,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.8292,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.41056446416752945,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.7174,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5436079044084973,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7716,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.45038021533211064,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.7074,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.4342779506245978,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.7158,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.49226940999672253,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.8152,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.5220423047113417,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.7654,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.7758953736238604,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.8961,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3854930807897228,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7132,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.48707337361403186,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.7717,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.49695349665441296,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.7762,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4476964992522201,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6505,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.6202234359546465,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.8655,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.5924779401268684,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.9234,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.5628833346263253,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7812,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.5541031075439173,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.8099,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.4432608568673146,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.7587,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5539590755326042,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.8822,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.5323410127256709,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.823,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.42560719757431825,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.7662,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.5261584777482855,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7926,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.5012552941079297,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.8183,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.5678895279740954,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.859,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4926381674080493,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7717,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.7495165793130969,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.9309,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.6307274534361385,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.9415,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.48031720156492214,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.8021,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.44262945926543323,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.7721,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.4489673221358516,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.74,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.5636990162785608,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7533,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.6190679087459675,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.8796,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.8275050823934263,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.985,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.46408825605364606,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7622,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.45975415834022415,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.8291,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.5169476805504977,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.8317,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.513118763839025,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7682,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.447672363663369,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.7256,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.6414550481006086,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.7853,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.5001712911293971,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7769,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.5350510273310185,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.8501,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.5960689101392795,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.8967,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.42735189779942234,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7208,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.5989912309790166,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.7989,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.40593249035056556,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.7625,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.5900373156108883,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.9246,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.5520195141305115,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.8108,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.4856158173207746,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.7981,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.5018996299240422,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.8017,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.44592198162703134,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.7587,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.5851073621552025,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.8203,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.6022549792651677,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7525,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.418459635990812,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.699,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.5013376005889809,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.8398,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.6079407284267303,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.8777,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.4257371313410328,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.7593,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.6591808043227458,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.9076,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.6802700917475603,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.8756,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.5181596294153098,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.8258,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.5981191712702367,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.8298,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4734819007822332,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6754,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.5499751224532713,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.8376,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.8101400981147132,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.8358,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4981836652220473,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.8621,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.6068909278602164,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.8989,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.5063610479063229,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.8067,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.6150433634808093,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7889,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.6595179600986578,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.9369,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.5242205074387309,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.8035,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.5249171801917747,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7271,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.525253689840259,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.8092,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.5148719131426918,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.7503,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 2.263188923952998,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.814,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.576438018451827,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.8351,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.6799800154645094,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.8366,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.48282807516787113,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.6809,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.5971682835989159,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.8206,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.4830829692501156,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.7988,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.48624698153396156,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7756,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.5583476779499136,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.8014,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.5121053928194806,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.8634,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.5109827941220455,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.8331,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.45922568949074166,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.7878,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.491968494299683,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.7838,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.5340310158148845,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.8843,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.5213494917538501,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.9101,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.3655596990339082,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7014,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.4226746379320319,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7779,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.431341435554287,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.8037,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.47820557918463213,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.7338,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4320850528449281,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7034,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.5121373761865688,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.785,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.5195334690582885,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.8547,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.47214121557243605,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.764,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.4517409213750945,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.7222,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.487304930266377,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.7959,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4557153863646574,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.8079,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.3929571758539548,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.7375,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.6326251573267269,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.9481,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4990926600937626,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7798,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.5934156452636286,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.7484,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.4278808297111898,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.6593,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.536243830039993,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.8349,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.46465540209464534,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.7015,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.433642846301004,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.7143,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.42269971011810403,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7105,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.4847553733063033,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.6555,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.5140368262100883,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.8074,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.7552403345216738,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.8453,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.56376967482142,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.8418,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.5851998857854613,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.85,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4105678044595301,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7372,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.6190898553898988,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.8528,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.6340439413696632,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.8552,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4871600449223175,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.737,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.46543459690668115,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.7621,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.45676040032070964,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.6521,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4444505690264811,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7469,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.4316611368918944,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.6903,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.48348628347312234,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.7634,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.5651308913118273,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.8512,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.4956568675602372,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.7371,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.5711445179616791,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.793,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.49332492360055186,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7517,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.43059417980440506,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.6782,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.515807728343383,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.7495,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.6139067016998846,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7653,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.5228184194482335,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.8436,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.45164086282048727,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.8154,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.49664338982240125,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7513,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.48084906692108864,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.7482,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.551448897376019,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.8216,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4445395746743038,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7074,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.5620772014856222,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.8429,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.4909096300506755,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.7787,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4681599585765567,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7866,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.5543045236010434,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.8084,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.45408125652307624,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.7466,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.46233535011485677,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7628,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.47013186928167217,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.8193,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.5887968982013919,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.9226,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.695042367869921,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.8255,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.4546170412410129,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.7532,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.5002388839330394,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.7211,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.46112879552081704,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.761,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.7208674165988788,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.7312,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.4663813215399649,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.7875,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.4829297395816714,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7529,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.4493307430244323,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.7081,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.44985051226127615,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.6445,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5586032923358022,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.8904,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.41189700186726963,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.7298,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.48701202749098055,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.6895,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5102273322747214,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.8056,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.5078299674739979,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.7926,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.3976489366799753,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.6567,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.505189628624692,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7802,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.4651704738807796,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.783,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.5943459424529413,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.7674,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.38255583421035383,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6984,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.669499610975014,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.8616,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.6167405918055653,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.7848,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.6810199151029439,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.8911,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.4851207421034525,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.7019,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.4675091668445895,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.7666,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.4372682694720911,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.8016,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.5569363401589187,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.9226,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.4437524237523622,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.7747,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.38670304289499796,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6455,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.4390419147479167,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.7476,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.48671406938555606,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.7387,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.452709425015621,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7399,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.48169458759755385,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.7716,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.5810191696791625,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.8712,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5765581151091413,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.8819,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.41820338774986837,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.682,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.46426557902311427,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.7801,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4952533035100198,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7979,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.4688203344646727,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.7134,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.43481292002665417,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.7983,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.6569565722752031,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7516,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.5597575663379816,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.8148,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.6499924668627625,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.8307,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5532966267081787,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.8149,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.5472775181001878,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.8534,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.491042510513382,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.8242,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.5041185848643801,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7893,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.5474803994533827,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.749,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.35925183829918744,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.6674,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.48099690864579764,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7598,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.7751380156755314,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.8022,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.49938410552595913,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.7422,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4882321906277085,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.8031,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.45454582469219645,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.7319,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.4934886271469217,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.8239,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.5424838119054968,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.8179,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.5110543967634006,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.8246,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.5787063388849131,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.7412,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5794064281337606,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.806,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.492692541135898,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.7348,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.46714017449595235,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.7117,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.5771394507869333,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.8032,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.5660541739325817,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.8302,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.5025679783937408,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.7271,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.5283901296934519,
+      "learning_rate": 0.0001,
+      "loss": 0.7609,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.4821874746709882,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.8704,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.5357850660811156,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.7379,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.525442476682316,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.708,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.3962009188355929,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.6954,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.5389712448135058,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.798,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3881918146298721,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6047,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.5315277531548152,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.8672,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.396419317534482,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.6909,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4806724454908013,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7973,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.4697484761739775,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.7394,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.5160014179048552,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.832,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.591745687809906,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.7554,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.4651574748926571,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.6775,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.6072165626333691,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.9198,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.5195555651371119,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.8281,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.5442537680604943,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.8562,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.6341924868794666,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.9075,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.5003976190088961,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7935,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.43780679774099346,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.6798,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.6084254517867603,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.8155,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.5195380244556218,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.8254,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.5814706420425718,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.8689,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.563914936612008,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.7789,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4711340862213265,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.759,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.59213774648018,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.8744,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.6243405291520489,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.8354,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.5992797953079083,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7935,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.41773210721308607,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.779,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.5192755859477955,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.8225,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5140188987129515,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7594,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.4609510429287488,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.7799,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.5358185906609553,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.862,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.574451280567273,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.8422,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.42928921805115855,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.7623,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.7272438157788303,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.9059,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.5559463653954599,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7792,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.4313360808757448,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.7717,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.47030954524984847,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.757,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.442738354497151,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7434,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.5667853061205357,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.8977,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.41132253844073,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.6707,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.7032423662806179,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.84,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.48968412442626585,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.7559,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.5422782601866228,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.721,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.6473133424013873,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.9128,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.5439588271027264,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.7603,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.4016782313446635,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.6788,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5068484930555268,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.888,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.5145274137351125,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.7793,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.6751853358091369,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.8471,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.5064023549791228,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7535,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.5281939437096247,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.7673,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.4409058470768508,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.6555,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4318211050051625,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7061,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.4423890277498593,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.7312,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.5308839756869687,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.8195,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.5027576313133176,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.8071,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.4731949786966609,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.7497,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.5183385982817474,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.6428,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5327655953051283,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.8299,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.5252812689417474,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.7438,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.4854605612544107,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.7781,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.47055365993939163,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6582,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.5560444704582403,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.8353,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.41571277834164605,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.6936,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4371639418042648,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6978,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.6722608795134971,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.8802,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.4970904267426464,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.7537,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.5680397577452019,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7023,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.6607651748179223,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.8185,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.4320010725813437,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.7247,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4379851357731281,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7357,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.5864123812194793,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.9109,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.42591442989357187,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.673,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.5692366538803478,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.8681,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.5146718413776341,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.7601,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.6637949998918076,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.8332,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.542905102982066,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7672,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.6532562664979465,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.8629,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.4526633920079731,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.7665,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.5367334612585443,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7661,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.5950768846379804,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.7678,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.4301047695545847,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.6559,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.7484350038467311,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7962,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.41575632022589987,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.6308,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.4393636084782435,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.7342,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.41562609846030607,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.726,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.4477045164078837,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.6865,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.486071042296672,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.8573,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4665545159433152,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7984,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.5524757999976719,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.8859,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.45394390731347345,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.8273,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.8670666883863559,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.9476,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.45014162704566263,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.7477,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.47423961273501625,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.725,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4950365048983719,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7586,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.5029439088135188,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.7981,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.5897608031444519,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.9008,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.6525085166965572,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.8962,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.5765106103125884,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.882,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.5661404452293268,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.8775,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4399109717922037,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6821,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.515553035736288,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.7302,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.5546251007029763,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.8414,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.36818646059975035,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6362,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.5973642474673283,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.8576,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.5453914955495656,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.8003,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4538127419270415,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7232,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.5145454434088483,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.8297,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.457613168791868,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.7314,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.529026255615779,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7671,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.5276732011135837,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.7988,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.5119811476084692,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.8729,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5669022170618133,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.838,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.47221332942467154,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.7096,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.4163881284245536,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6801,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.5010530863802051,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.7289,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.48392985830710034,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.6489,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.4814123674902523,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.7967,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.45439283152827153,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7022,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.5160712883620804,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.7722,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.4084949839651646,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.7663,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.48014982117628685,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.8165,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.4281813822547069,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.6895,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.5134186078998622,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.7335,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.6060821542053557,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.952,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.5428771989367646,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.7522,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.5649056681978917,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.785,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.5410584645799422,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.7819,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.5297779787392666,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.7827,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.4981974984077256,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.7938,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.43965228584236327,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7716,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.46078442248990276,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.7952,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.47099705722870444,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.6965,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.6014987937081797,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7589,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.46239459077607387,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.7276,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.4697286751474339,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.7152,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.5210272405996849,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.8268,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.5118511288541373,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.7668,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.4711383662788997,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.8149,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.46408282702945824,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7562,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.5394146489399196,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.7958,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.6869150004348163,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.7718,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.49055454254608344,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.758,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.4458991499930382,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.672,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.4347672269064152,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.6597,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.493692750523553,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.828,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.5427876490200283,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.8172,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.49025422986820805,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.7671,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.6049657915730587,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6777,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.4819646779616934,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.7578,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.5235721200251294,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.8077,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.6087209952321163,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.8524,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.4500483929554947,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.7262,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.45807234060644647,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.6825,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4129702576367196,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7258,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.4403723919576854,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.6541,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.6009178369133598,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.8899,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5239256047852181,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.8046,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.46279889651377026,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.631,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.5501400019892834,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.8407,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4602923749840518,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7225,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.5463141230994777,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.7215,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.5693472216906276,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.8745,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.41613892010850634,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6888,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.577122066529536,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.7882,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.46998332690172934,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.8013,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.4383546462072333,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6924,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.44865404369550144,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.7643,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.5925515225657153,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.9031,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.4257641971014278,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7118,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.5206866535828725,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.7415,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.5707751370071729,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.7437,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.40524442168153607,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6977,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.49142407085977796,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.8295,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.5282617440640158,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.7301,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.5451141058517067,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7744,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.40934033588093743,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.6475,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.48871504711122515,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.6969,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5172307509140853,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7246,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.5100529082891403,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7351,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.5249875756447643,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.7637,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.47400722879437185,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6943,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.45904504089238257,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.7526,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.4934751039540469,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.727,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.46594305801411295,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7175,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.5510438341683022,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.7947,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.5352113629485974,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.8727,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.48080489649554553,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7542,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.5130889735164389,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.7984,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.5169453233769978,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.6668,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.6683968572616635,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.868,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.513319017914739,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.7778,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.50500207550488,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.8149,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.6871309726289309,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.8847,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.4828761250518926,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.8028,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.5402832248712572,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.7615,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.428383779948302,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7426,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.4661800045858397,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.7363,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.4757002484645323,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.7165,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.5891581050601145,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7801,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.5877017442395175,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.8208,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.4374020804860988,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.7092,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.7680263524757133,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.8502,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 1.131632589727481,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.8092,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.43927601534850325,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.8155,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.6044194834712543,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.8709,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.6638143282370651,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.8182,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.41295433463974385,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.7358,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.4185388022872638,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6662,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.5425672199599714,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.7387,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.48854563428955444,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.8425,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.43279089007911115,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.644,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.4624374296243218,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.7802,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.4287093400356675,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.698,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.43881856116583695,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.677,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.4724279515832686,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.8122,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.4632779979930065,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.6816,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.41653471994783514,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6731,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.4041036126580989,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.6171,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.4244669309942605,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.6509,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4480670699233299,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6782,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.5147817666607787,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.7843,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.6590523887277467,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.6622,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.5906784821557615,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.8182,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.79123920087716,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.7645,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.4934105295980472,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.6993,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.5169576112242033,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7903,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.5951619801235911,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.8015,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.6270626637751038,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.8664,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.5411196988348133,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.8788,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.5145499844752968,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.7333,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.45312931845329213,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.7871,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.479732080413242,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7336,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.5463357924719869,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.744,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.5279290163752073,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.7531,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.49836728800476254,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7327,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.4971802403963867,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.7987,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.48463827627897155,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.7463,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.5098697199096175,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6947,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.5234828393057811,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.7572,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.4458409726885109,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.7151,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.46498815645957925,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7341,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.48012200293932256,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.7592,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.5805583747116773,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.8637,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4433313879267007,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6725,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.5194855571239937,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.776,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.6018279111933093,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.8028,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4240158310913475,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6878,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.5762406227870703,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.8156,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.457905008831502,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.7923,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4932329588950252,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6956,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.49378929371510405,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.7152,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.6409762670531869,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.766,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.5376631985987848,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7525,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.5240791531669154,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.8279,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.44664839730064054,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.78,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4039073575383679,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6781,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.450923506585266,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.8058,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.48897683906804906,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.7291,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.4808742667856329,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7342,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.6253985573620412,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.8818,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.4846287892781986,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.6947,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.478933595678575,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.8098,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.4943540669969761,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.7402,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.5985943815299349,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.7581,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.5936083742537585,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.8027,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.4802903816045552,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.7532,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.47401679138024017,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.7888,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5245939430829126,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6565,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.6094963923285219,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.761,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.5166758468287893,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.7691,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.5986893700296746,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.8324,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.5608641556504018,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.744,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.43588479094760113,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.7413,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.5047332229040048,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7423,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 1.4880627722155477,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.7006,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.453024387315063,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.6529,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.5195667651892474,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.8618,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.427840085987605,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.6182,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.6190438453635412,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.8217,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.49760488405259695,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7374,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.4331316786801372,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.6793,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5556857259709147,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.8043,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.48880450793228364,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7714,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.48399177289629297,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.6613,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.5238996308406841,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6827,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.6337195832994496,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6894,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.6348308055489945,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.8044,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.4025760674722977,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.6456,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4452860285112815,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6119,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.5334380567489596,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.8085,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.514311178905084,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.6978,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4399321327958649,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.7175,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.4772773737004066,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.7418,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.5594951719696981,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.7048,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4157946441972806,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6922,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.4806133334704124,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.7044,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.5117520912077905,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.7487,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.569944929975138,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.8501,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.5969408820785443,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.8674,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.4941985486441663,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.82,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.6502519671746156,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.8603,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.42783689687814525,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.7035,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.39953505531503863,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.6791,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5454433994457187,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6927,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.4846780456609193,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.7063,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.521882062564805,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.7471,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5398532028341828,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.7122,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.4532324599227224,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.7293,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.545815898973468,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.7505,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.47865165562544837,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7213,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.4891032017620811,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.7227,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.6903768073965876,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.7913,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.5174446406993433,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6764,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.5456937956755602,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.756,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.48884842701907966,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.7131,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.46462372035273325,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6943,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.37672762008403704,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.6181,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.5825245132202216,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.7794,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.45910540935082,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7712,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.6243536274834849,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.8211,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.5785678929556791,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.8172,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5632530690227657,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.8021,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.5924913339482613,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.7137,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.6408171320388871,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.8802,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.43466130623589433,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7327,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.41419333541385567,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.6299,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.6802342126781251,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.8703,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.516618660644921,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7347,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.5224342097400351,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.657,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.482786023733216,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.6935,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.5440058879369418,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6983,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.5802944767597759,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.7421,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.5377633957272775,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.7127,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5207231642793138,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.816,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.46890830943659795,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.7444,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.5239841467518617,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.7862,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.4558894972380989,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7325,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.37375074027943683,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.6262,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.5494650548069104,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.9288,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4818770521338313,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6742,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.3657880080152438,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.5971,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.4221079896962407,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.7058,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4764515830267938,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6907,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.4738166520918258,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.8141,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.38835481161871727,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.681,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4565891315033268,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7752,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.4608422462138753,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.6388,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.6591127677132887,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.8727,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.4862142948347619,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6995,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.5528915723787529,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.7565,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.5284686733306023,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.7969,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.6239566996614822,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.8646,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.503868202525054,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.7229,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.47973915900013236,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.7528,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.5042533217056323,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7988,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.5117440316579105,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.792,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.48090769570543257,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.6092,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.4546460753128134,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7465,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.4778685816741002,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.7818,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.4172931822775071,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.6907,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4416266010096713,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6804,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.41224064168095553,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.6721,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.44723339787007826,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.7507,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.5353240653381068,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7261,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.4561756949502303,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.6876,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.40559531788607533,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.6803,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.42581332166379354,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6682,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.48054075181260647,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.6991,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.49404378864320575,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.7297,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.6032366633974174,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.8139,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.6478730032015604,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.6995,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.4753162644667982,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.7458,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.546975265700906,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7909,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.38565874944226974,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.5954,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.5505491228691735,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.7357,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.46402518754307825,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7473,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.5596790726024379,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.7905,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.45655679397131826,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.7756,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.6093963165564299,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.8257,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.4738455718490557,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.7141,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.5129489762341312,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.7099,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4753950044121046,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7651,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.49435225998858384,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.6927,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.5149207631621087,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.7084,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.48962380021064045,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6561,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.4817734820863134,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.6415,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.46244492687186617,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.6482,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4456649711346683,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7377,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.47727175429766844,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.7589,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.4771095634002514,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.7587,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.5224356015984947,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7609,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.47419551634832074,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.6966,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.6484886986470106,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.8549,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.40119852570672876,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7171,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.5106224372744584,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.7613,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.5559868341959251,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.8186,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.42241843133295354,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6973,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.3997332202408528,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.7467,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.542573865655012,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.7384,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4158894795983539,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6843,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.5562155006440779,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.8085,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.44157287828678954,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.6756,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.5523897884351046,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7646,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.40132248104520923,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.6557,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.45195078908680764,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.6369,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4884570689666154,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.8075,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.4884060026975244,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.7314,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.48232054442174005,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6823,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4782472833326341,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6893,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.587509602421774,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.8276,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.46554498264893784,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.7039,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5771668265979287,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6921,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.43425107121264933,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.6942,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.5690585351463997,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.7757,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.5866616526886929,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7548,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.3859883378448622,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.6484,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.5445627210962285,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.843,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5125557887469259,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7498,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.5395159672227288,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.7333,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.5829779547713551,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.733,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.44359651415082885,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7156,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.4547494731971776,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.684,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.5131654521142365,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.7467,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.7262209422589005,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.8295,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.4936651159819274,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.6335,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.5554414008987123,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.8386,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4408668117277434,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6766,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.6336439233367365,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.8759,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.539206563480282,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.7304,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.5727836618844765,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.7783,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.43515016127065365,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.7522,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.48313578334329116,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.7079,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.4296980252575872,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7411,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.5549622257031763,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.7094,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.4829372579543755,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.812,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5241206593297438,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7042,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.5021517259541153,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.7994,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.45570593394802356,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.7288,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.4469788796959992,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7239,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.6211504895337312,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.8837,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.5173983081339508,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.7577,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.6005497546462576,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.8438,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.6769735434298438,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.7799,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.49118468134244414,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6405,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.7473252064539618,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.8578,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.5319378972264717,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.6932,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.4476301854869852,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.697,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4885567824260128,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7583,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.5086670274026673,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.7399,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.6873643208618484,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.6774,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.5727613478332071,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.835,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.4545663499750636,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.633,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.5094745047780689,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.7591,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5811358589874416,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7293,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.5202623326748433,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.7304,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.453238071655243,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.7493,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.47978793526594116,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6386,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.5119181666210523,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.7606,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.5439472122515798,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.7197,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.5127205371166901,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.7982,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.4721633286931549,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.7265,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.48143401096482086,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.696,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4833593941086222,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6882,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.535180650617051,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.7315,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.6944748078696262,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.6996,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.5037578131433572,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.7503,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.4495767237600222,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.7238,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.4483524790086883,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.6971,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.5180856413717794,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7512,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.45358769303728175,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.7662,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.6204504192047962,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.8175,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.582202298948077,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.8074,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.5809861698330605,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.6598,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.49271726527820786,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.7314,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.5363551881959038,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.7172,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.4926777999825674,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.7787,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.46908259578162226,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.7163,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.5397282744472077,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.814,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.4697132224477373,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.5815,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.42579868398354936,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.6658,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.4569791807916529,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.7226,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.39669229549545293,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.7225,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.5297173041860715,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.7861,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4864412981501662,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7326,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.5589885314689083,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.7526,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.4867758716241291,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.7805,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5318053199685544,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7644,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.5354608205298255,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.682,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.38066758056826167,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.6337,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.49654962314840656,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.738,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.4173555980908285,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.6503,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.4833517675096117,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.7633,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.4457017967605104,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.7198,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.48172126652085867,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.7486,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.4691379373475714,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.7917,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4366557171419435,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.7614,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.6389432991012376,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.7562,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.45218296818007325,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.6676,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.5127013510044014,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6803,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.550397023114835,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.722,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.588260946539843,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.8028,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4293044340924837,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.675,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.43436506152113274,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.704,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.4693231671304673,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.7876,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.4534259390563445,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7488,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.43524954894664175,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6833,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.5173068347282955,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.6967,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.600135698112275,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.7668,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.5785766990626001,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.9031,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.4850535026822109,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.6891,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.9696176740359524,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.8852,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.6009802936351409,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.807,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.4672352030261569,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.7554,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4218704627178395,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.72,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.5059599658543962,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6995,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.5656753835448954,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.7585,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4774116641105069,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6692,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.5271022801821177,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.8116,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.5313883619232115,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.8125,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.5071211777372121,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6735,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.4852707362890887,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.5942,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.5550116153852038,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.7453,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.43543146198658933,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6987,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.42424194634554874,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.677,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.6108351206781172,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.7572,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5559873692980049,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7539,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.3937613220805676,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.5808,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.5023593030301023,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.709,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.45169725609074757,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.7311,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.5282539621355999,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.729,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.6313553350650196,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.8341,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5219722982471708,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.8209,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.5833231239077713,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.8449,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.43843108756571597,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.6107,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.49291061450379847,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.7863,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.5327844658716501,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.7032,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.5629117337994617,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.8057,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.43057064449013455,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6467,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.40144706648058803,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.6154,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.41224528576852115,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.7186,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.5493275235326429,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7174,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.4774079575940184,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.6967,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.48463906773266147,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.7209,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.5163153518005792,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.8135,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.5456312112526748,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.748,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.4649146794818222,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.7371,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.5167663833581301,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7598,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.39965746354745346,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.6812,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.5372340369217614,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.7217,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4367158355141629,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7026,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.5245846395952943,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.8301,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.44809015973199046,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.6098,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.4896095179374265,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.684,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.5234106203427703,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.7691,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.46476988376251316,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.6239,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.49874973349599516,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.7547,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.5905184413825677,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.8502,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.5035949347611718,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.7307,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.5738022003972325,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.7793,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.5424653390734194,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.7757,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.46252207283074476,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.7,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4907851167397113,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7292,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.4178661965994772,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.6679,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.47754089253765014,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.6702,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.6633324760822404,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7803,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.4727307422889256,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6659,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.5730428573362569,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.8064,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.5099587028969585,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.746,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.5086193753246331,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.7935,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.5240438660890212,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.7281,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4504134348879838,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7004,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.46641359685348005,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.6662,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.5838658371077295,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.7176,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.7012391602970637,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.872,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.45478183438206427,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.6577,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.5536165423047683,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.7681,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.6019648360041853,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.8668,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.528126687311034,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.8946,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.5787961931296473,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.7104,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.5584247292865336,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7912,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.5592127404421677,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.8268,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 1.6459891604457142,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.7625,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 1.9460915202760372,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.7972,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.49389502057942186,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.7326,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.5296872403780049,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.7438,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5206900777405126,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.8099,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.562759680643968,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.7302,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.43740427820467087,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.642,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.5809060492495025,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7144,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.5078337671329715,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.7088,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.4223935317492648,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.7237,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4306772984469156,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6317,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.48965342980711823,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.6731,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.48871038552264057,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.7343,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.48062667686149924,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7227,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.47089628085218616,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.6916,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.5779379966268988,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.7505,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.6465429524456934,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.9103,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.6218601157422986,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.765,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.5156130773122601,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.7075,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.40010584341983213,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6547,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.6052419811423837,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.7519,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.5380969282925244,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.7498,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.610094785258102,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.8135,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.4548361760721386,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.671,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.5626173142904772,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.8198,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5625778147640857,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.7443,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.4606738315995986,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.6705,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.42348707737647995,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.6206,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.7293607528561209,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.7697,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.4803752752873544,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.7195,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.4721616013466926,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.7272,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.5749202610287232,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7178,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.4918692092749649,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.6898,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.43734853080318675,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.6955,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.5224295333686861,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.78,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.5922274092117513,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.7297,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.5705340603856977,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.8368,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.4910596221861991,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.7026,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.4467032714737605,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.6869,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.4207718435624483,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.7002,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5165282223653747,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.7054,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.4737730901307995,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.6875,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.4563247614617513,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.693,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.47543609508727075,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6393,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.3476281053216799,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.6165,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.5202231870957309,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.759,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.42654028759472845,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6388,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.46701105025080786,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.6391,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.5536664806031145,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.7509,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.6799427233853165,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.837,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.4786584684295808,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.6551,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.6346363261580554,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.7388,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.5249745108943299,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7307,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.5420943687474257,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.81,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.44520089650783257,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.707,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4415726863620001,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6431,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.5168783328821801,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.6849,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.4408776737461188,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.7219,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5600784676932644,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.8664,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.465766389137664,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.7233,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.5088576755246479,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.7127,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.631801694369338,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7959,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.5218246264212684,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.8172,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.4825524954277326,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.7186,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.5263756401894659,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7588,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.5607804957473732,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.7896,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.5317198707514146,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.7555,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4830066670495747,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.7157,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.4273456175724436,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.6735,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.45417802530711415,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.7218,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.42860751675190184,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6094,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.38947169025510137,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.6599,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.4075364372659137,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.6509,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.4131338362822137,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6275,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.474524778672002,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.6994,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.4929519260488483,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.8072,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.5739079113452518,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.7298,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.4393783480975882,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6485,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.492183761772122,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.7441,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.45490549116062656,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6082,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.6727059960569023,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.7843,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.46021591435431813,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.7266,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.43787677770044686,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6711,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.5054525066581391,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.6961,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.4219011331339254,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.6766,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.6642900014098958,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7591,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.5184000275481225,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.6511,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.48729918547181306,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.7974,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4763858189554415,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6517,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.4797922479036312,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.6274,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.5065831946679562,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.7318,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.5505070909082705,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.8353,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.48013622734320494,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.6355,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.48653054810125534,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.7309,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.6003193953384854,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.8524,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.5207762433264173,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.7493,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.5234557463199222,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.7591,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.506189512868477,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.741,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.40236452875488554,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.6004,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.48698958631856193,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.7003,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.46135947514065956,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6278,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.4282588139719773,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.6055,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.49488868238005507,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.7311,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.49645661708457584,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6515,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.4632243612209577,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.6983,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.4394766733689113,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.6608,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4621408336491037,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6321,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.4748794449223368,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.8009,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.5811079235129704,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.7525,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.5536825290584947,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.7958,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.4449735221813921,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.6269,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.4889710443032619,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.7816,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.5349562942688116,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6645,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.4934466804295362,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.6681,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.9936281137081363,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.6872,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.488307043217961,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.7075,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.6075844527604656,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.8007,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.44454223076412264,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.7356,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.5519065208356204,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.771,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.5478271754599924,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.7592,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.4408424564893833,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.6955,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.6270738931351739,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.7421,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.9517628597807377,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.8323,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.5364284273406238,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.6865,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.6419546535789074,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.7216,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.5269843159023666,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.6767,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.4352999520696695,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.7933,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5580198182493007,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.8353,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.587909658106802,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.6288,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.5520557307410818,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.7293,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.6114940005699151,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.7133,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.6103014329036469,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.8082,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.5046918194388451,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.6353,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.5830522248552155,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6757,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.4094809100101818,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.6888,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.5414726816177821,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.7496,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.6107634014919001,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6886,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.4106943693814969,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.6593,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.5230257324815101,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.6109,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.7562403955437415,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.9042,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.558874746914631,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.8388,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.48021688999202583,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.6812,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4602437546671285,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.7249,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.47383038109374326,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.6403,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.5274325832002795,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.7773,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.46136179468542826,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6554,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.4991708505289876,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.7029,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.626880797898286,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.7866,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.5845448167722781,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.7247,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.4862178222397938,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.653,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.4861391733533597,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.6721,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.42820206581647385,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6301,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.560034769193496,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.8402,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.570800689301257,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.6529,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.42588385382805777,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6535,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.5415931163910935,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.7774,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.6272470407089946,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.7145,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4535081229895934,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6937,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.6417022320016039,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.7383,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.45964017202954827,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.6948,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.5320742897519447,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6478,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.4134215065349257,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6269,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.607172942134624,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.6956,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.5252893129734731,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7253,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.46582545750354765,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.7394,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.4094338874129953,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.6637,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4636293196252806,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6563,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.5415115913571019,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.8612,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.4522640549950337,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.7838,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.38016753165518974,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6309,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.39069729040605633,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.684,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.45806841285698824,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.6847,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.43406155609265734,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6533,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.5676457830066718,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6769,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.4362044805517463,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.7128,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4448641004513165,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6975,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.4245635378541507,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.6425,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.4422408012266385,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.7702,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4657874956661579,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6791,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.4812393443605173,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.6028,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.43947797613439554,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.6834,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.46533180700732835,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.673,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.5505691048109808,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.6438,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.6099905659325539,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.7642,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.4772093942998137,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.6966,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.8552915050342436,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 1.0042,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.47144045944851415,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.6995,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.4764511314947748,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6833,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.40338923322749526,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.6012,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.5469624653093387,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.6508,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4033243451447917,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.5882,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.5650299736539638,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.748,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.4378622632528653,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.6493,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4459629050866768,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6236,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.6222359720620512,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.7108,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.5122806021831506,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.7062,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.5414807022479352,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.7937,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.5881527424573568,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.7238,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.4265515449250894,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.644,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.5090537767426827,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7131,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.4973165848843641,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.7288,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.49711395640263095,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.8063,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.45003147938779353,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6672,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.5263873461041867,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.6554,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.4473989869953113,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.7079,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.5555097211297417,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.4139824487629015,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.6547,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.6032150172121085,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.7181,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.5419524456251257,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6984,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.4466457886122112,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.7033,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.5912517841993868,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.7759,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4169609800985717,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6302,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.3825795729910924,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.6144,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.46253675399641014,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.7467,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.614394404605922,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.7495,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.5042783672677922,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.8186,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.5560448870125844,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.7447,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.490352727204397,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.7362,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.44728772727080823,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.6484,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.49411244121884584,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.7414,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3850409913369315,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.5999,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.46460818555883665,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.6602,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.4258599918242338,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.6777,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.5053769678163469,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6584,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.4327361124299862,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.651,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.4267839667595951,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.7154,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.5273772134438605,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6509,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.47718116203120287,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6662,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.5424380948717015,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.7798,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.5015991268010247,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7824,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.723900086009267,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.7137,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.4085920312350529,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.6213,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.49665624100972683,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7234,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.5304273846910255,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.7091,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.4961187437951281,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.7463,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.5448542537532,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.7914,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.3958449956876178,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.6833,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.402377124921106,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6247,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4151752614619234,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6616,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.4802399299257456,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.7318,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.3979602633245147,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.6792,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4669863303336634,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.7233,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.5489679394586826,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6343,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.5123990189343095,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.6827,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.5575525400757082,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.7718,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.5732715043210574,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.7825,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.5111553239009662,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.6764,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.47616444580861356,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6491,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.46051172278486796,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.6965,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.5059183466848223,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.731,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.49106201960409346,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6666,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.5782792211532685,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.7777,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.5543293060255573,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.7129,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.5528950661278335,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.7632,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.3778988008025402,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.56,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.4139560921395551,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.6697,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5485450536674931,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6698,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.44495726703478017,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.6781,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.6346426038661458,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.7519,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.435310939222078,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6706,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.4741403213511459,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.685,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.47941646677875055,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.7037,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.5271890177485758,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.7678,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.5799828402558466,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.7509,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.49652524125300823,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.7167,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.5713359605247931,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.7461,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.5331404008804096,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.8511,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.48242135683654225,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.6862,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4957894244323012,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6923,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.3579312113376426,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.5774,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.41496835638705143,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.6858,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4641364341036602,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6982,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.47077736559547806,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.6239,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.5389246898448312,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.7296,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.5032207629100807,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6301,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.4950965243486184,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.7424,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 1.3905925807886583,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.7117,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.4544486959060869,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.686,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.43078132109881695,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.6374,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.4541643311828197,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.7426,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.46086034918164714,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6856,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.4963135460344512,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.6885,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.5880509527579628,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.7538,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.45817989964313643,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6883,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.47083292914551456,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.7467,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.4270945612153523,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.6991,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.43395210880349344,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6581,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.5120487977928043,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.7571,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.5426408727132688,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.7361,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.5048709941315147,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6776,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.4406697455325257,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.6452,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.49960312933395784,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.7075,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4827963100118666,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.677,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.5025373556902175,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.7196,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.4193837396130738,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.6838,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.44675396034287024,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.7201,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.4900721781740654,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.6876,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.4506382278303298,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.7381,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.34987146431543165,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.578,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.5034243662178048,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.7319,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.4227055102320483,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.6679,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.45871344074784653,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7228,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.57147546388469,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.7528,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.58725852317797,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.755,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.39234786416075723,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6112,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.4341603371983377,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.6532,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.46665268783732106,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.7195,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.4343399933377864,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6428,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.5401607092202143,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.7836,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.48561638655807204,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.7433,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.5705619733589412,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.7619,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.4723919850354279,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.6844,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.6641172197701478,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.8525,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.49334083555221586,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7167,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.4676196339068881,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.6979,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.7208173416387388,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.8211,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.47014724542518643,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6786,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.8022893409373152,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.8995,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.6701439219566838,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.8955,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.43662100159815975,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6931,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.5397248237775718,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.6742,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.4175834153515395,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.6579,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.42630390827681963,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6214,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.4994756342367005,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.7364,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.4771620720318906,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.7127,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4090006633917449,
+      "learning_rate": 0.0,
+      "loss": 0.6534,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1522821307072512.0,
+      "train_loss": 0.7911884547869364,
+      "train_runtime": 28059.907,
+      "train_samples_per_second": 1.069,
+      "train_steps_per_second": 0.067
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1522821307072512.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..06461bad72253ec3f573313ab1262f14be032f30
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "v_proj",
+    "k_proj",
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ddaf7c2b1b57ed0830d8135342e9e50886de2f53
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fdcc810a4d0279ffcc37f0eb3cef201b38a742e5705de63aec05fbeaa6fedec
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7b8f3008a98b36d12f915dbd60226f676e8ba091
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:201058e5be6f78f9b81917e024d5fcc86aa13da0e17b59942fca6ad12920fac6
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0cc5d7d6ac85fdf80193e4cf5a62172db1a38d2
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.75233063420523,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.2255,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.0166309192256069,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.1534,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.132417232096234,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.2803,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 0.9352205978088273,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.3408,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.8267363258300194,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.353,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8607407611901363,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2463,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.7587680789518995,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.2031,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.7229754022476446,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.1776,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.7072269871649532,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.1774,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.7979191350531996,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.2332,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.732784118427125,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 1.0808,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8714380275232437,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1643,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.7759061513691938,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 1.045,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.6991535365961644,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 0.9502,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.1604696086280564,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.167,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.7315881408984929,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 1.0043,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.8424740742961367,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 1.1145,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7494124445848623,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.0957,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.6606564603126561,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.8768,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.8399242186106046,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 1.0244,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.7826675090493671,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.0998,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.5583109291998765,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.8725,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.6256868157176779,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 1.0145,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7795845544031471,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.0163,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.5955611793656965,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 1.0053,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.4745775961075087,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.8087,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.7200636042605665,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9417,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.6711680178676414,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.9923,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.5953871452449566,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.9618,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.6178325939358567,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9101,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.6040273680854279,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.9319,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.46179099610956637,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.7867,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.7770792886943544,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.0039,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.6762096229135146,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 1.006,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.5679138696476846,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.9809,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.733382199403288,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8995,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.7190720122239778,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.984,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.5398148623052902,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8568,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.6012517612506856,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9214,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.5024500935278182,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.8186,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.5079206358540236,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.789,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5774736050190657,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9261,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.608909561912015,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.9369,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.655766067594755,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.9121,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.561485841015699,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9179,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.544693161395086,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.8253,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.7240102546369898,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 0.9902,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6136903666564417,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9077,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.7395175295085202,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.985,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.42343314793329256,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.746,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.7340484577277879,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 1.0382,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.8434600088153975,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 1.0219,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.5230606202440629,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.8054,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5088409387329998,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8681,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.612321126960915,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.8984,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.48597178715632633,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.861,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.49815647600195784,
+      "learning_rate": 0.0002,
+      "loss": 0.8282,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.5957716890357231,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.8735,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.5712024806738941,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.8615,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5149023413528725,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.865,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.6329754633640781,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.9009,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.6750849329772663,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 1.0187,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.7041355749011133,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9702,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.6514821462486268,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.9102,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.6236717189775234,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.9897,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5320977040159945,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.9185,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.5634172509699379,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.9061,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.5067857180737967,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.714,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.8713818108473501,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.9746,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.6691172402828967,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.8702,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.5715105590453465,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.8766,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.7245869944292714,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 1.0353,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.5718973923822783,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.8917,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.5430722243096373,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.8779,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6601605568954748,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.987,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.5605403622572899,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.8821,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.5418175502646891,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.7718,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.6293702158665475,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9803,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.6507407167279051,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.9547,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.6484754200572854,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.9405,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.884928118391995,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8783,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.6908795419087032,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.9867,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.5824682851756094,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 0.9331,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6316813393610857,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.975,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.5352489975081022,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.8376,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.47309528931486894,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.8269,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5246505586243143,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9217,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.4841678812444819,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.8478,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.4966241873726176,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.85,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5248880201307413,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8825,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.5061920170969972,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.8386,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.6017734401928818,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.8824,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.7623650810154796,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 1.0427,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.6642439159610598,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.8763,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.49909208678794403,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.832,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.46393029258690244,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8565,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.5883008873326704,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 1.044,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.49221859513746163,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.8893,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.5871360596459384,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8823,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.5563645290456422,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.8756,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.5456951817526335,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.8443,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5868406983232394,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.9256,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.6397539530587217,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 1.0257,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.48140395333780067,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.7974,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.5224908629742993,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.874,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.6876692164439447,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.9259,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.558106466635048,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.9256,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.6866064916024611,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 1.0133,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.635731427860939,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.9906,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.4682780527874329,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.8046,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.5859345250297248,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.9377,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.6351657217287303,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.9576,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.6096188854419039,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 0.9146,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.6730316913770924,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.948,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.5963591112984713,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.9112,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.5243383010499729,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.8733,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.6218918641524634,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.9845,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.510062739859879,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.8241,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.5341151787626189,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.8744,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5059598231096198,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8382,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.516674937056129,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.8452,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.5582815377012604,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.8855,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.7055345194572793,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.9673,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.8231949440189873,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 1.015,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.588128888793964,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.8971,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.6681449843660198,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.9095,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.5550207384306163,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.9013,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.524683912520627,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.8574,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.7748097181550015,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.9668,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.5283157806951922,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.8686,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.5616820687558489,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.8975,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5054594593622966,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8434,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.562220612042354,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.7903,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.5603148933465106,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.8448,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5569518546926293,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.815,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.5255991576983609,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.8619,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.47846179606609723,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.705,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.576217255091171,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.9585,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.5919574449406261,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.9191,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.6072688062314806,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.8699,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.597010946898004,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.846,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.5809705626854813,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 1.0107,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.5932886371768736,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.934,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.6032409916519083,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9186,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.5457310796425244,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.8617,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.476457612204842,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.8619,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4799963625134939,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8132,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.5463824490629723,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.904,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.58196447750242,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.9932,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4640382992323238,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.7935,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.5719287398508892,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.8877,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.5236602458110028,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.8657,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.6509209711208326,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.9885,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.5609187112002201,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.8781,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.492495205615016,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.8598,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.6043981736009975,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8453,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.6610003850374823,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.8916,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.4795860359494709,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.7991,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.6337024654196615,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 1.0085,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.4997461207024708,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.7467,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.5286499908095579,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.8765,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4704310497131496,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8461,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.5937567267938444,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.8693,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.5371755014877978,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.9063,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5800336586723125,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.9283,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.4958033989577926,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.8362,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.5399180175782222,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.9651,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5843106376237646,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.9353,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.6848308749700495,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 0.9304,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.4752685849578976,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.8504,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5477850021150672,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.9404,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.5181254404743765,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.8639,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.6495017550231028,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.9617,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.6380134101944767,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.9427,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.5847268292402413,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.9004,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.496999883623956,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.8021,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.7499340485732703,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 1.0304,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.6816349545708028,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 1.0057,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.5705412165518122,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8076,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.8249649542063773,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 1.0162,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.5405554579229593,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.8356,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.6330093176321191,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.9122,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.5222870952333465,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8313,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.6560197653961808,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.8648,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.630258194763127,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.9679,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.48276619536438653,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8696,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.693055832960493,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.9758,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.5404047908025655,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.8055,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.4932545259714465,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7993,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.6539678194350204,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.9077,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.6892197304317585,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.9111,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5834308356502489,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7855,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.5512081138368022,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.9188,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.49902017114107877,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.7896,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5446308240927318,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8752,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.5547189415434525,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.8857,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.5327935283971226,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.8803,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.444289401959508,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.6973,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.6306712871206122,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.9123,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.5659955054675453,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.8996,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.510245250500772,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.9321,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.5614492231253495,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.8889,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.5467141420862776,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.823,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5093701579959352,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7595,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.5750194113830609,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.8937,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.5673603349909078,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.8523,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5895133750544301,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8462,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.7750206260510349,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.9396,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.45157308135391144,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.7762,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5306904599961818,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8957,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.5196935041544928,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8466,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.551998514437722,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.8251,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.5231235609049675,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.919,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.545005886462309,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.8389,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.617161346116059,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.8845,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.533965318790452,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8025,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.6120440370891225,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.9859,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.5430165201923808,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.8282,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.5189411745406076,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8734,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.5416218607957094,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.9556,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.540844652707924,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.9289,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.6125147723856628,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8679,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.5519197783369388,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.8236,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.6667872057607972,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.9513,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.6665613951386896,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.977,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.3899502563708877,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.7104,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.5289508773870094,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.8844,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5085346436930963,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8637,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.5272601306833273,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.8849,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.6525105891946158,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.8833,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.6234486059350676,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8809,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.5296725270937737,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.8608,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.6297752187019564,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.869,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5797725950739668,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.9539,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.5322587181419928,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.8305,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.574481904911282,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.8696,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.4799510430555526,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7752,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.4547495523917592,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.7918,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.5263436873112797,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.8603,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6252578470714629,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.9067,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.4867148597600659,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.9251,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.5290662373217285,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.7274,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.5153066665623456,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.9242,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.6625691460737685,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.8166,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.670819395240693,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.9512,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5151189803391039,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8542,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.6362609947941695,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.9869,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.5341739523116258,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.8713,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.5080000356479262,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.854,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.5532806823745345,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.9342,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.5000267304985636,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.892,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.7752667597486199,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8867,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.48210521426119607,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.784,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.726766942591237,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 1.0296,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.48997429914486446,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8225,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.6901249877257869,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 0.9044,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.435930713686654,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7588,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5974265537076754,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.9269,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.5342606116909296,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.8764,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.49358763046628124,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.8179,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.7336425072326883,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 1.0536,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.5743919866990923,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.9086,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.44425441099827134,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.8629,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5164911733879413,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7725,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.46286711402137637,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.769,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 1.399245238341527,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 1.0842,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.5045294232571674,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.8658,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.49840155683797577,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.8284,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.7117131658267387,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.8987,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5128146847876454,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8484,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.7722720996758503,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 1.1469,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.5803708158094779,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.8368,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.42993346683667927,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8031,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.6065197949647673,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.9472,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.6908488885342616,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 0.9874,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5363581236326376,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.847,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.558700883977016,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.8182,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.5947475743975708,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.9263,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.5380157540832908,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.722,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.5538227064782307,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.8307,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.4832783044469024,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.8599,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5262719970151286,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8081,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.6223522228511364,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.8736,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.6213342599722824,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.9558,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5561811603373654,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.9001,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.7575560964349567,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 1.0266,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.5184421232657922,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.8553,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5837734427615423,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8718,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.5577590220415555,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.9378,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.45233445244978454,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.7427,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.5558031950828102,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8903,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.5093962275712094,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.8061,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.5050638192275981,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.8559,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.6324928777374244,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 1.0162,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.5195438049240261,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.8277,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.4761289332277623,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.8011,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.47188232850246264,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7454,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.5162299384722681,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.8799,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.5238150633584967,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.8069,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.46840724242613746,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7969,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.6269601229860577,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 1.0077,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.5600203459620383,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.9362,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.491478107722541,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8715,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.5271381347138833,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.8597,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.4824130420607745,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.8014,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5134450187561762,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8139,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.5823218443587672,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.8517,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.6349330124167318,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.9057,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.5295789449060487,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.8535,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.4831281030968079,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.8261,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.5910041302313082,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.7828,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5392756185044956,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8113,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.557452760417172,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 1.0161,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.522505954409253,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.761,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.7233142924067804,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.9351,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.4676711125844684,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.7071,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.47685551246661534,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.8177,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4533488930327886,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7436,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.4533058132862425,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7291,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.5458120540129042,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.8137,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.5643466697441072,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8801,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.5380033786204588,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.7599,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.617314497027414,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.8547,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.6264038818277112,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8778,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.5846003637502903,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.9112,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.47567665918466145,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.8198,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.49196104288845344,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8299,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.5601486377476347,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.8747,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.43468064391038164,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.7781,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 1.028194014871719,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.991,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.627738619519797,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.8718,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.47026200802077206,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.8163,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.5361118321266511,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8452,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.5993619983899835,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.942,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.46253089860484015,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.8561,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5110368561327658,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7982,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.45178298535160205,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.7952,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.6283455503137422,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.9345,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.41016500264183564,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7344,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.5778912728374959,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.9244,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.5307862089180293,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.7725,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5408782628800713,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8191,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.5343495537158315,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.9992,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.4629573452278619,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.7467,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5432787902120778,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.9322,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.46361653410119125,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.7921,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.7190701351188497,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.9846,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5165758662612719,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8487,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.6198398172150162,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.8864,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.5285512826336245,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.8763,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.42007644809898664,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7347,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.5828585755855576,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.8223,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.42651129826500506,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.7122,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5797530839376076,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.9591,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.544495478635436,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.913,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.4432574119212112,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.752,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4798537314874882,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.9229,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.581634013330689,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.9095,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.6178175001226105,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.8316,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5503240721251285,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8607,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.4425801376441001,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.8211,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.539226309972477,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.7202,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.5453302685171948,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8175,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.4884968861852387,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.8352,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.596533126573078,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.9176,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5116158046966843,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8144,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.5085625162646913,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.7584,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.6091533276019632,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.7884,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5443685151140606,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.9256,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.5236325125412632,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.8713,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.5867356801037134,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.8452,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.5095813956183218,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7325,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.6069538436884427,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.8661,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.40768137417188727,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.7727,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.633628916149109,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.9909,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.4957646817286244,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.8159,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.47105987995879206,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.7986,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.6163522938264158,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.9178,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.5832180274336531,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.8922,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.509935329197866,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.816,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.669176687285736,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8657,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.4752882620373262,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.8092,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.7051297197996066,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.8506,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5577527025167552,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.8511,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.4139542702196724,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.6828,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.5182027854667481,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.8287,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4299886279907164,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7708,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.5143149967352305,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.8828,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.6078729480646635,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.7933,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5517572170783533,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8796,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.48253470214967964,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.7825,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.5577749832898266,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.8553,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5226852442696008,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8066,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.5367652669016284,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.8055,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.5403403005109394,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.8121,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.48584158907093455,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7834,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.499235660344172,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.7473,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.5367712416434521,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.8446,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.5824307197166317,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.9132,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.4978245923360915,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.8089,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.5328348890517718,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.8156,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.5082485708902648,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7834,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.45628945910244506,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.7999,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.4163463024035274,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.7293,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5061406032370239,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.8125,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.6363393218642723,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.9316,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.5504945090482256,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.8154,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.5391672309228815,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.8646,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.4793822093382232,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.8117,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.5498770917213628,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.8909,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.5654925398384275,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.9082,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.5483893173014566,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.9328,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.5094528950078349,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.751,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.8131714559962321,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.9416,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.5625225241486854,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.8736,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.619670441403755,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.8054,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.520660453368968,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8804,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.5590866916233668,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.8684,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 2.7958795352687495,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.812,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5850438335172121,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8854,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.5500640216510291,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.8491,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.45447930553133686,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.7655,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.7192110122318098,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.9129,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.5894740072945653,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.9886,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.586829510685889,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.9216,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.5177084599142368,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.8435,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.5747541873884852,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.8624,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.6199485085369928,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.878,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.5666686132982203,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8008,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.5010951768121458,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.87,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.5081946802813396,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.8655,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.46053639541147723,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7879,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.6588624487113882,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.9208,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.434622738600863,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.6974,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.6294823997970527,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.892,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.48523784665379055,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.7763,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.5373909243025393,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.8374,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.44329198405302284,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7473,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.48584097444788127,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.8479,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.49974502928648723,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.823,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5026833029263131,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7422,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.5307811503098481,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7479,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.5306542674659223,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.8433,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5209472676621327,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7319,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.6062715684433185,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.8592,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.45574144099355285,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.7948,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.47381837609008487,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7444,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.49309845559709176,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.7693,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.5168908058408266,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.74,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5738186976667065,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.9781,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.49598478169438215,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.7199,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.7236920935018343,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.9363,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.41008240762151804,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7539,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.5116016775806168,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.8255,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.729540440321037,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.9091,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5939518708163508,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8069,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.47339713598201816,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.7996,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.6182943082393394,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.8422,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.4956243203684921,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8494,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.508301548382395,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.8665,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.5509188319582408,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.883,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.548518166698976,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.8754,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.5297762745417619,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.8727,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.6384739843727559,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.7691,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.5796092277488748,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8753,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.4480463427628124,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.7904,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.7029097235032447,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.9765,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.6096410772724791,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7711,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.6376346604968903,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 1.021,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.7241821384453352,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.8957,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.4989190729708759,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8056,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.4224851417231786,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.7753,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.5785550065502728,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.8712,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.5346221293744353,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.9194,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.4976250533874276,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.8288,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.5605203286789534,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.8594,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.5002828789623779,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7608,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.5440527531770649,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.8472,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.473801147894544,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7923,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4472347584396617,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.8068,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.8784206403575792,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 1.0083,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.5100070260977737,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.8667,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5209546159346587,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7523,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.8013445926468988,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.9211,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.6181769629664103,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.866,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5384962948779574,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8699,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.4955773239230858,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.7785,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.5699962373055503,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.8766,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4874255045850295,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8068,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.4963726627949967,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.8507,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.5182793668484437,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.7986,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5967711305347155,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.9506,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.48206999659371763,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.8616,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.593173402003062,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.8472,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.52689705134063,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.8237,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.40518253160702855,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.6664,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.40625602545500733,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.6755,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4705409099200104,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7428,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.6227074667393875,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.923,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.49906766278188625,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.8099,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.5369831859736955,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.849,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.5758348686795218,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.872,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.4432309288935212,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.7717,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5423226988590671,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8917,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.4924505451254493,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.7686,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.45580850054236843,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.8152,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.5358146105236021,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8728,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.49970281242685627,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.8341,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.5905759371849212,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 1.0342,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5811480896043241,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7844,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.5151835007482878,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.7968,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.5059531882182916,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.8128,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4574818857154386,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7983,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.47905678836689186,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.8213,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.6704893566972991,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 1.0397,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4277287971472794,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7041,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.7264039727777464,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.8519,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.4691708986843132,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.8379,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.6577844600409193,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.9765,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.5412708227465047,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.7459,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.4638379143755818,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.7704,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5719164794068985,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8899,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.44866976447629964,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.7286,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.5435761773073292,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.8717,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5419190131475101,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8028,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.5653884038342555,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.8189,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.47985429486474734,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.8004,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.45725501060587703,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7343,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.7636790180687056,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.9407,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.6454673200210016,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.9414,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.5018566675451702,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.8254,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.5313927747180329,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.8057,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.5257917504923237,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.7753,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5240194109151427,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7936,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.535866550069367,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.8626,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.48948103579799007,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.796,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.47524504416647967,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8374,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.5012843780074685,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.7437,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.5262813096929368,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.8007,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.47653902866981207,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8214,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.5172011787522601,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.8412,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.5149235899195433,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.8536,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.4427009159931128,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.759,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.5292275381933234,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.8827,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.582521020164629,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.8743,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5069519148688234,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8831,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.48385860192439323,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.8062,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.4852897385486229,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.7802,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.42645874053163363,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7051,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.49064710613686185,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.7964,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.5401695976111952,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.8259,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.5375868692082563,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.829,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.5811058934400566,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.7528,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.5069164637571755,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.812,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5728008304872453,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.8032,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.4091892462358017,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.7931,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.4679162706641364,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.8033,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.5264838841353725,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7719,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.7123429880382754,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.8529,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.5137572290704788,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.8076,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.5455463754821794,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8327,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.5074806693324961,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.8404,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.5149714849884053,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.8242,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.5287866259312569,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7885,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.4810071947579147,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.7658,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.5921198116749884,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.8433,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.48809684178386664,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7758,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.4838177625207053,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.7975,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.5130253612052892,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.7705,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.47539445596553065,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7383,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.4787554596238986,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.7523,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.5674834308198589,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.7946,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.45646133783305565,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7657,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.5614694326739641,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.8507,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.5014240113047879,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.7218,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4472389676560879,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.804,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.6064763208584959,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.8987,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.7119278693007127,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.9979,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.5026387345978868,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7743,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.5526951799907838,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.8137,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.4927055292327108,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.8203,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.48074662552300734,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7488,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.6877010020255859,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.9138,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.4313484746703689,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.7364,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.47636978018532844,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7957,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.5055420311446571,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.8562,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.5531988181381383,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.8923,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4344709603231299,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7367,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.5813064802188114,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.9447,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.5993626462305357,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.9174,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.5772228866511909,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8579,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.4990727842899779,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.7831,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.43021107184507557,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.6802,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5985180624025509,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.819,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.49020502952037,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.7572,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.5184831439018202,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.833,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.49554259894650116,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7604,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.4433760973357246,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.7036,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.526481275834099,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.8491,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.47061940544784364,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.861,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.5934063587664357,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.8761,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.6163230108439915,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.852,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.3378322500199042,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.5935,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.4232325695110825,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7599,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.4970253850681227,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.8726,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.45270525084655294,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7714,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.5344002172985072,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.8408,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.756368936702217,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 1.0281,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.5285174953413889,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.8283,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.5411347592986606,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.8734,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.4110227441690126,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.7238,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.6084585821051274,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8188,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.5395012175676103,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.8572,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.5992401267533382,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.8908,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.43236326156039784,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7862,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.4946043648550045,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.7476,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.5184618203964196,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.779,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.49856805007399935,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7748,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.4223495185880146,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.7711,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.46328799644070096,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.773,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.596285576103134,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.87,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.5081533212758815,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.8247,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.5975594987336019,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.7307,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.49422845682486166,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.9156,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.48969307550494445,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.8016,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.44535271675758226,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.7971,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.41569141825306105,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7058,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.4442526091486681,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.756,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.43241219268770614,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.7774,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.551195864295374,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.867,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.3704138442913533,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.669,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.42772476117022945,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.8204,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5019012824057236,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7893,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.5462200393424741,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.8098,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.4607324445006312,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.8057,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.8083059211716902,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 1.0064,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.6487825918852744,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.9977,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.5888023885057454,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.8519,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.469175184433065,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7438,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.5278850865079271,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.8291,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.51675783823404,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.7848,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.7178680876928034,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.9272,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.517804814528721,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.7646,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.47377186429963813,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.689,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4875184855956417,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7916,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.7105916731272629,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.8881,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.742048351965138,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.8945,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.6408990406108908,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.9067,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.4579451451060999,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.761,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.5375881455685046,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.774,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.5690127553057223,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.8267,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.5525921666015885,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.8956,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.43057917172296345,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.6629,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.5369851823892494,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8473,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.4421763523413578,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.7547,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.6683809588214158,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.8685,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.424323287318125,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7535,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.7285451043404508,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.9519,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.48670522315608944,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.7798,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5148457431572767,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.8448,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.46087000378455195,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.8342,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.5310656086371572,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.8356,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.5934194459135458,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.9335,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.5346786798201316,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.7713,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.58607560514104,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.9632,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.609226022433584,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7857,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.5858962119631774,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.8422,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.5898803646397386,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.8418,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4517854783386305,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7254,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.5345428463035593,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.8547,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.5636075683541205,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.8664,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.530351789134002,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.68,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.6160935335239659,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.9196,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.5643390099503108,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.9024,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4807419538502926,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7408,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.5531256861760592,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.8567,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.4718076265230407,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.7591,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4149949168385742,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7399,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.5035941906366352,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.8239,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.5349329534928783,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.8727,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.5070941608741568,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.8018,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.49216670496778525,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.7589,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.5761438336408812,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.9939,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.46972496509699546,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7172,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.4941160902091317,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.8086,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.5550066709819924,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.8026,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.6072781403801273,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8696,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.5615374399243932,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.8344,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.5570377881327896,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.7939,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.524379630946479,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7864,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.571881857654052,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.8803,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.4326176104183051,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.7871,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.5186171184185153,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7266,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.596725330052778,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.8192,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.5459542972673567,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.8215,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.42351102934213064,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7529,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.5304552971293983,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.866,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.4713690466074876,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.7434,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.49382378877514266,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.8755,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.45454409178905214,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.784,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.5601592151502356,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.8895,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.52555648303496,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8281,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.644756954068664,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.9863,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.6340066876753365,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.8258,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5395727291816608,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.9003,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.7074307047526871,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.9401,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.5764662285713409,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.7597,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.46830365827528964,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.6924,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.7437748252665622,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.9863,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.5646809541546257,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.8527,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.6377326999288427,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.9223,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.4754566065020194,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.7673,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.5111905541845883,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.8718,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.5313472637050075,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.8734,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.5894268673196887,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.9144,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.5459978241336768,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.7703,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.5235275768444314,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6861,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.4377967510855603,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.7768,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.4968442166288454,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.786,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5410674523048824,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8808,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.612217909281079,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.926,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.49409111490774654,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.735,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.4153658816647891,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7636,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.5318606721582779,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.8346,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.6498570697531973,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.9235,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5251596978694136,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7657,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.43521208573080616,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.8307,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.5675765480381267,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.8893,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.4457518470697796,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7389,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.6125756999885809,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.8579,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.495065197072205,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.8082,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.5063770097020417,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.8032,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.4226249130395913,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.7202,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.44610159161640167,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.6598,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.49682259220980884,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.8139,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.5013280835711356,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.8026,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.5183690065958357,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.8727,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4603853052273151,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7738,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.6073729194657201,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.8352,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.6068524072143933,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.8887,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.5191397985291558,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.8078,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.5641609132857682,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.808,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.5389677760221879,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.8395,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.5184783334009618,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.8319,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.5459629740792883,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.8212,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.43376535113344805,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.8556,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.48247796104572255,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.8638,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.476546953886047,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.7701,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.5093857000584507,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.8421,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.42181155000414133,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.6899,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.4882886499004825,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.8336,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 1.0218606193263766,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.7448,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.42477641436317826,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7224,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.5353153320960067,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.8221,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.5451172345800744,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.8424,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5181668693402046,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7698,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.4766116638861282,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.7126,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.44770221837631086,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.6988,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.504750260059945,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.864,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.6325774483574017,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.8822,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.4569329524560576,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.6728,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.5355129531529659,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.8147,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.4442749326372372,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.6988,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.45287896753395207,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.8024,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.49870183332551243,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7681,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.4669554709536968,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.7716,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.48479901942900394,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.7534,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.5730119832200375,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.8598,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.5076618420414802,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.8088,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.7184939384281536,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.895,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.4384030304467163,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7392,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.4692553533275476,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.7433,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.4522213692729189,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.7129,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.43422551076221394,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7158,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.5490870203496083,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.7931,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.6217571465441969,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.892,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.633769034792794,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.86,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.5025293751957768,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.7794,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.46658759424992413,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.7948,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5181717210611255,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.8416,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.49200410156120783,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.7819,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.4167994844684718,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.7104,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4163699666315266,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7148,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.4238355139305973,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.6728,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.511426303580633,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.839,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.5356515901046077,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7557,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.6882185983313841,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.8474,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.6641428887881485,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.9442,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.5679268441219842,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7947,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.44286158098750944,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.7316,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.4508969293267315,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.7638,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.5308334648457299,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7683,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.5946803376428245,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.929,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.7207562732813872,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.9165,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4779980260077528,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7764,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.4855169806953814,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.7795,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.4662692243441799,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.7901,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.40931315804008817,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7117,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.4854966151390374,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.7212,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.5357969560679365,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.7459,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.519103528674126,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7753,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.5791316915320031,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.8571,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.6084051457588875,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.8512,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4841458991923718,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.792,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.6251617340588154,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.8055,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.3840708723496544,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.7237,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.610610369045068,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.8078,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.5101123044764597,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.7977,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.43049751229646416,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.7101,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.6156941943170096,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.876,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.49964764586889204,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.8106,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.6425904840094377,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.8873,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4594886918342748,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7891,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.3715193579073943,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.6497,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.4969181835624368,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.8602,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.5112261662756453,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7946,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.4404318603249988,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.8057,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.5074910199048389,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.7261,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.6579442937626416,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7994,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.5269627351371972,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.8513,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.6410417420546307,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.8727,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.509013657441772,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7401,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.605665680146341,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.871,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.7658208778704714,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.878,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5077593808819223,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7604,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.6091180723940991,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.9141,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.5647979519746409,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.8112,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.49231303109397256,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.838,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.507668649681542,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.8796,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.5270163965454614,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.7963,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.4607488335162496,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7155,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.5625901485250786,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.77,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.45832785535563436,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.7626,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.45676392241459746,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7891,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.5331910631819192,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.8101,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.5336665803865309,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.8476,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4725071119751479,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7257,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.5485778876123043,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.7574,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.49722835793082243,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.8108,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.47025306102710457,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.8576,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.5428983779819186,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.8883,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.4694803567020228,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.7932,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.48473876763436824,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.8554,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.47005570048715595,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.7806,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.5006290971264497,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.7757,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.6262326768214996,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.8247,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.6495934091606899,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.9522,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.3926168857682602,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7462,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.45491320496713733,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7765,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.4434547347861371,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.766,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.5861265159507776,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.8013,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4550910918588998,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7629,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.4797524978868209,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.778,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.5258462973897162,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.8173,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.5479945980446406,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.8253,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.4568344411849772,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.7206,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.505548947892483,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.7785,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.48924710640761454,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7868,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.4107100163640565,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.7318,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.5565731120612221,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.8572,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4982784682798707,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.769,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.4995791829692604,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.7514,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.45805779859294643,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.7062,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.509454093532379,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.8522,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.42024159280118345,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.7247,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.39307172555639547,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.7005,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.42169313512842804,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7528,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.5039975584791423,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.7759,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.40711579520247354,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.6914,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.6028022279161257,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.9179,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.526392815479632,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.8378,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.5248041781489471,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.8562,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.42751189208185303,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7296,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.512742744328725,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.7847,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.5611307796660479,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.8123,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.5575651974777656,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7401,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.4723621023001122,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.7066,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.4575807168925933,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.7333,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4651231651925861,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7569,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.3966971100695475,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.6847,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.5161805818763336,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.7304,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.5663476833765066,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.876,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.46355268174699166,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.7611,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.5092064946900178,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.7269,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.46970918456239436,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7396,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.4557188143082749,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.7341,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.6647036642923956,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.7488,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.6765332341145955,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.9396,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.5175862474749728,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.7783,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.436284207863048,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.7639,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.4686215143619508,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7497,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.44740547705866324,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.6466,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.5501474923213852,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.8357,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4729206846885701,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7393,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.6172459831714341,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.8347,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.47854416285741824,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.7491,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4388952320442764,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7897,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.5456853460509431,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.8387,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.48268009519768473,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.8265,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.43680690221110663,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.72,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.49961382847292923,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.7646,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.5450142519081406,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.8989,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.622389521380628,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.8413,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.47369590455789173,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.813,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.46356194145737945,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.7059,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.44236433205679154,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.8015,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.8646622375431554,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 1.0327,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.42402134760906085,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.7366,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.48443938796841723,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7556,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.46558447549359927,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.783,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.5132249584302769,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.6438,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5177124844667969,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.8248,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.43916411940111544,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.7437,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.6446654464232441,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.7377,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5476434595463063,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.8251,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.5544217543377928,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.828,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.4863232958573473,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.7145,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.5264052654515747,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.9068,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.405147594001005,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.7008,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.5847775056895158,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.8319,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.3871865423396232,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7312,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.6457922270370335,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.8671,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.7058579473633974,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.8271,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.6236517778840219,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.856,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.5573009691516873,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.7543,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.49780016357808454,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.7296,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5077651372406793,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7598,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.5880793316898241,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.8521,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.4501858915478432,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.7065,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4525662342544168,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7283,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.46356536316788016,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.711,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.518028225576915,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.6937,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.49351490979787055,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7069,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.5228500242330347,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.7455,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.5719893983924824,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.9416,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5860034231810596,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.8477,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.4261071701248609,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.7264,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.46650246057627515,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.7839,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4904731107816936,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7921,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.4106913663496465,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.7229,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.4630754228578689,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.8031,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.669350177875791,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.8647,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.550665101899265,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.7562,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.5337377164529131,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.8166,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5369097933800977,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.8819,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.5726516016893167,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.7766,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.5174007996748117,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.7886,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.5108075215013611,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7897,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.4952947355198917,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.7651,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.3929843911174011,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.6889,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.5123541093374682,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7496,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.7818822625689277,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.8296,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.5186603963496055,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.7966,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.48888205408805113,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7337,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.4230633658945411,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.7062,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.5203464661211605,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.7566,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.48279774075271586,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.8452,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.45710951243634307,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.6955,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.5619796545503565,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.7901,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.48951536650972494,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7348,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.47600503644883513,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.7258,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.4847148591163568,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.7551,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.5372752457232837,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.8417,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.5872679932756703,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.8513,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.49083770690518475,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.7121,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4659127202746154,
+      "learning_rate": 0.0001,
+      "loss": 0.755,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.48699175107640585,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.8061,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.45843027668721104,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.6362,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.5333429546353413,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7565,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.43853763581027966,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.754,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.5266600942678731,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.7966,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.42561735307159076,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7163,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.449037557482995,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.7291,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.42693571312985196,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.7571,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6057037416060455,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.8379,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.38959454129722154,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.7104,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.4412624315931409,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.7196,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6146574697091995,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.8773,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.4521603647619686,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.7239,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.6000515937257633,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.9331,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.48263637707963297,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7464,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.5369406336442489,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.8192,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.5734133799045289,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.8167,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.5157323536339183,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7275,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.4320454187671807,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.6858,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.4362601796684237,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.7336,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.43655010339443623,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7057,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.565766930269078,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.8337,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.4846368722573889,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.7062,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.48713923130411746,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7796,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.6359416804035497,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.8497,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.5866104931830324,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.8544,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.6620979616144008,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.9016,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.5382906566763935,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.8,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.5147446979701066,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.8645,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4869675410544784,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7537,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.5085385693138902,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.7488,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.466320279935278,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.7034,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.6619554748109024,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.9005,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.42704129186922574,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.6908,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.6515567080007995,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.8798,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.639067034187619,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.8557,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.46267469343545414,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.7318,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.44148492385870847,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.7608,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.42488598217866247,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6942,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.5400187523020266,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.8963,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.4033778484114782,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.5889,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.6648280630297925,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.8801,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.4158160219681407,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.7063,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.4340034267388822,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.6817,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.6060790801817617,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.8711,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.49214997795697046,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.7479,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.4554214767887092,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.7654,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4834491284888872,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7904,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.5120511712619266,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.7684,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.6515054587877325,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.8495,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.49541283087197946,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7174,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.5203766008620797,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.7405,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.5692575546824659,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.6862,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.47628058661436834,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.8073,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.4742278003831475,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.7852,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.5541467054786648,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.807,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.48691385545687804,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7268,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.46472962020380765,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.6764,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.5500366067206548,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.7323,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5108050694391268,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.8426,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.604186928096418,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.8281,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.4795065894056041,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.769,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.6189202287040847,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7709,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.5380527668118427,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.7603,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.4115800576976164,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.7383,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4693879302110707,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7566,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.6167483217002174,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.8895,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.5125593007986087,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.748,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.5070721336715346,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6403,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.6555424217817568,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.8753,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.4700363265708347,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.7608,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.45530246840219085,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7679,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.5769680891344217,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.9385,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.5213522658748875,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.7074,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.5430504538099926,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.8272,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.5711565429390287,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.8051,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.5134932074468219,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.8268,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.5285646384272197,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.8118,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.6025455434008704,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.7901,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.4612698316993332,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.7982,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.5369786917678391,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.8198,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.47531988042191753,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.7494,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.4473668127784352,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.6847,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.6211559575041504,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7318,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.43328372044700475,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.6811,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.47205587916228653,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.8094,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.4694639479325875,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7448,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.47735306311845915,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.7255,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.4744457136560929,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.7868,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4500414979056848,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7213,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.5390310724367505,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.8223,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.4778521611194694,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.8438,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.640710731690861,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.9087,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.5073079590848001,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.8257,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.5713390277685219,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.7989,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4996593533508955,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7331,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.4621108429117102,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.7546,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.6679531151631573,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.8472,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.6668175327591062,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.8815,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.5663129656097874,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.7657,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.5685556021286235,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.8798,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.42441445817147244,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.655,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.5685652190537315,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.8485,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.5624917087659866,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.8148,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.3846401553503173,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6862,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.5743668213994452,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.8873,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.6479373917403084,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.8719,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.40451274184862296,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6928,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.48464990730039686,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.7871,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.4265284370295491,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.6842,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.4531712771929814,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.8089,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.5282833160589813,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.8612,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.4966712268976588,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.8598,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5036087290902579,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7659,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.5469001239551731,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.5943,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.4463582397100332,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6806,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.4482926143147082,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.7409,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.5499106714015792,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.818,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.4929794102658157,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.7785,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.5349856013157154,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6787,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.45238853987498595,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.7539,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.4237508155111719,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.7112,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.45808964928763096,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7771,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.4091449714829364,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.6485,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.4889806760975611,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.6759,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5508245245182095,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.8022,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.5898191229656782,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.8398,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.47826214094023284,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.697,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.5681861178160683,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.9488,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.5392993969331693,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.8002,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.4913637580180928,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.7783,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.504611556710071,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7271,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.46748778682185405,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.7513,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.45296519137301017,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.7341,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.49348300837413195,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.8698,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.5032313813731415,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.7695,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.46451439857227805,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.8341,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.48209882298040035,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7778,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.5516104800786016,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.7292,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.43801049561849525,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.7435,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.5079927430133366,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7599,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.4763049318851867,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.6854,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.7403263673714452,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.8395,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4813353695503427,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.8065,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.5446426690578455,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.7714,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.4326182537228563,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.6318,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.4584204156957829,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7055,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.6282796044032658,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.8419,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.5543211445921447,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.7328,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.655151151291856,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7464,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.5380245745320105,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.753,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.5265961461267854,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.823,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.6381331403841647,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.9757,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.4447661712160991,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.6729,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.4045124027471263,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.6791,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.44717160795570676,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7998,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.4844227400492327,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.784,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.5960917780879336,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.8309,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4941955447021566,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.777,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.4671897428092522,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.7454,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.5189767174413965,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.8536,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4704982863123676,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7822,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.5406238052780499,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.7489,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.6020386246541988,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.8678,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.45764114524716276,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7431,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.5067908160648289,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.7513,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.5348274612779799,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.8676,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.47661862978938707,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7565,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.43205702010047076,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.6713,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.5378722416279199,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.8349,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3895921853008002,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6663,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.5082806574142494,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.7544,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.5650201467097065,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.7768,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4096872443532017,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7259,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.5379896796802529,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.8902,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.46756979598603726,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.766,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.48308377185190693,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6935,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.46655363286281903,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.7312,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.6958980881773419,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.8651,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5290377094155534,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.691,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.518133987643529,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7471,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.5157380752691214,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.8151,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.6494412370042536,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.8192,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.43205634335668175,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.7397,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.4554541305344579,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.7428,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4493850061507572,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6925,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.5101316029528573,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.7068,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.6696214707884905,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.928,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5144302677407107,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7072,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.4802014965966708,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.7351,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.6233943714464858,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.7396,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.5902832195659008,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7455,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.5148558440489647,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.8222,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.5422651160322576,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.7444,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.5997442019912321,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.8776,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.47216962959753883,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.8149,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.48578142733428253,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.7723,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.43694832545933093,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7322,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.4387965685892197,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.7192,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.44221737929282645,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.6721,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.5317043483069767,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7802,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.5254488331545611,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.854,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.4320448488412785,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.7555,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.6693722250597276,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7934,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.561061711091939,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.7534,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.42815001849208884,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.68,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.6616699072813104,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7953,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.607465993246323,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.8411,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.4634088038952,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.7263,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.4663127644283584,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7019,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.5318269218021549,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.8198,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.48418239365215526,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.788,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4934059389962716,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7581,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.45641908593579644,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.7607,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.40804816282301337,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.7346,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.41541357433087245,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6507,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.530273367861406,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.7793,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.45198700938716535,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.6777,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4910311261306767,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7497,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.4191032950549475,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.6815,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.42521602565259287,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.6696,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.45724365836303976,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.693,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.4681549116148737,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.7026,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.6789018870198453,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.7361,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.5063196359817714,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.759,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.770884473818953,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.6958,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.6074969474432725,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.8502,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.5011061449472839,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7515,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.5350931694868268,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.7487,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.6716992073597984,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.8926,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.524797879941467,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.8274,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.4809902269491656,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6717,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.4219959336351506,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.7069,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4657782597005101,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7688,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.568815942322247,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.811,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.5377443693598044,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.8183,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.45838809987067625,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7749,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.5101782962781369,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.8099,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.48877370124847747,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.643,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.46387351043563374,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6959,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.39213100681003926,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.7196,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.42281381656254524,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.6999,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.5186431685086226,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.8203,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.4485652182183885,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.686,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.5572375127517957,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.7907,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.40511949815619563,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.664,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.482497695987161,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.7242,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.5357150658223143,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.7433,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.5014769543067805,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7665,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.5921082136434273,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.7779,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.48359932766996533,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.746,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4953142138271574,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7494,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.43792013269202484,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.6628,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.5439812674482977,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.7148,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.5922963850903797,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.8375,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.5806771674779112,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.8751,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.4701483806067986,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.764,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4889909786916612,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7088,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.4613240962353329,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.7149,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.3984948217464435,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.7382,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.501250894300205,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7694,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.5822686939948196,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.8485,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.43881242398728537,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.7095,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4512436169356323,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.792,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.43294726731770017,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.6956,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.5981157102451093,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.7722,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.5866986489901131,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.8363,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.4785783401013728,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.7856,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.5080859880685213,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.7517,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.46964094461012745,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6938,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.672978787343272,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.8052,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.5027873812480866,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.7143,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.5810707920425806,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7721,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.44439083231981336,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.6527,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.42210824519504037,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.7518,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4714095795814111,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7473,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.504061296510171,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.6969,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.46041365032163317,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.6413,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4969594868488173,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7814,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.49584945662167007,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.7614,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.5274520652139624,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.771,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.5332755545886234,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7514,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.49100379041992465,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.7213,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5938538866312052,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.7977,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.4942488328210268,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6769,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.4215994160402016,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.5855,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.5086154225201112,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6753,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.5248714135836858,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6745,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.5620922527146265,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.7977,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.4182022770681136,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.6789,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.42913433058351197,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6787,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.6440569118119236,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.8633,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.49867787813607417,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.6548,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.506007457551456,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.7303,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.4625411722735148,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.6942,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.5030217459622003,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.6883,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.46391289917365475,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7552,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.4119788599648552,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.6306,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.4399745670376556,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.6398,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.5260393823344774,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.8704,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.5601385184124167,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.8091,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.5569550119402903,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.783,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.6694025213672582,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.8641,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.48258198891676685,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.7629,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.43424476605807916,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.6529,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.556758950681185,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.8336,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.46801204945419045,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.7457,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.5170130104399749,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.7409,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4723598024244657,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6647,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.5073743528033355,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.731,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.5333546065197741,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.7705,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5430941842482363,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7337,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.4521245278658801,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.7228,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.5579528039858371,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.6376,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.4378522151415932,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6456,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.47299190909586863,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.7334,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.45553444421775985,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.6508,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.6104515238781205,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7464,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.4312168444174294,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.7016,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.6148834760259394,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.769,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.4715346096559044,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6776,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.6178802722197313,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.8642,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.6860616728128789,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.8978,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5622536592324762,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.8318,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.6339672235931774,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.8077,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.6337318821383485,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.8657,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.5440546726106485,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.8027,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.4885364928818766,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.7567,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.5547202784254522,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.7392,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5522379529790205,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7507,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.5014160980658192,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.6989,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.48194452166869545,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.7215,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.5414145480932072,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.7474,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.5363974976619018,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.6966,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.5315348555098458,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.7057,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5496948104991902,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.9065,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.43605694299155395,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.6293,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.46088868134026584,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.7132,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.4649796978337266,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7358,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.4418888859085772,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.7071,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.5046766627277882,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.7768,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4557547083973194,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7131,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.39055024125647814,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.6099,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.45005225206007193,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.7233,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4326733656972398,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.634,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.5376495961296939,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.8217,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.3553498079363257,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.6513,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4831175795186089,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.719,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.4580642378892101,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.6152,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.6353832067445115,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.7743,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.7433318010308233,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.7238,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.5120089821147191,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.6953,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.5530914964555543,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.7672,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.446703431755806,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7543,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.4985683155201103,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.763,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.4447455806296091,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.7282,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.524732558471761,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.793,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.5169358793386757,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.723,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.5670401797972314,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.699,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.44486036937631557,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7626,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.5124037474686822,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.7847,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.44326342488274656,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.6848,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.557505037931972,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.777,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.5183138109894138,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.7005,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.477675504552373,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.7449,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.5426554793530708,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6782,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.44255186163158894,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.6841,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.3863902721025659,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.6402,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3884260682205065,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6679,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.48546800516790917,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.7065,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.45241040521383147,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.6521,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.5625006140730437,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7623,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.5578660711528415,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.7355,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.46600670807627503,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.7163,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.5593721202886864,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.8431,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.40681967664480323,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.6492,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.5289946303807045,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.7155,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.48816964803481955,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.8317,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.5769581385100243,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.8158,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.4771966205047752,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.6407,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.4979329202373666,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7623,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.5001062879658064,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.719,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.506521075551192,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.6791,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5503240252563162,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.8363,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.5162890993641392,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.7164,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.5295010839522402,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.6623,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.5019958403606616,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.7614,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.49783319519372066,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.76,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.47246774450334006,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.6311,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.49908252295917693,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7004,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.5023872903425932,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.7962,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.5296038386612373,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.7145,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4801478395862959,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7896,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.47532808111907654,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.6979,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.6487872517751047,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.9316,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.45283259562586164,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7261,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.5359049357600549,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.7996,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.57903559949146,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.7025,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.42218551646518954,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6646,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.4729458889883054,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.6803,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.5367830113665476,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.7685,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3904123291334064,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6442,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.5182504791183655,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.7414,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.5489916184826816,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.7705,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.5345518887498807,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7244,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.39855204806143646,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.7069,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.38343271970491233,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.6327,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.5320127876609486,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.8237,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.5319164206608027,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.7438,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.4475564619340168,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.7102,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.5204112631420996,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7062,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.5665374236680606,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.725,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.49320064737854913,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.8121,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.6305004785407522,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7841,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.5103737337145858,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.7701,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.5587074934448885,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.8189,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.608655438526152,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7247,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.3973307572980837,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.6174,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.5380683436519683,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.7722,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5235736873409991,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7243,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.4668120644303702,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.6957,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.4843637052915313,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.7176,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.36948340124858087,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6135,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.3817406780247521,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.5869,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.5036523649989718,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.65,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.6680926557764835,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.777,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.4547140456205486,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.6212,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.4760441911558683,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.6633,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4472468798232282,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6992,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.541400706662573,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.8108,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.5837970062841571,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.7361,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.5825471471342758,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.8008,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.49119367337757014,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.671,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.5146132969965356,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.7137,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.45711650738707627,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7292,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.5450920433168046,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.7085,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.4880516059939469,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.7886,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5540779020706869,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7657,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.4985735020980697,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.711,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.4345770921338324,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.6625,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.39460650068404723,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6976,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.5785645508518101,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.8539,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.9136536474478617,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.7693,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.7312061473447439,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.9283,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.4833939545743598,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.7393,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.5777872168268464,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6804,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.7351180136300047,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.8204,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.5079474619126504,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.7901,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.45629376236150493,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.7184,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.5133009909462517,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.8571,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.4853061726082453,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.6878,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.7062162795938509,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.8062,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.5187628854617226,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.7226,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.44021219651503096,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.6203,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.5201824428298304,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.7495,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.9047447233905686,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7796,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.38481153766405335,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.6882,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.44462652136571124,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.7444,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.5055843010425,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6945,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.5534967200400872,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.8019,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.506842202837792,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.7231,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.47564544136627496,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.7637,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.46953984846854974,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.6723,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.46835967280477037,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.6714,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.5744851355888679,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.7623,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.46780492927444983,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.6781,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.6108339290589371,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.7367,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4200445732358918,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6772,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.44003838748442664,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.8008,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.5101369359628932,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.7532,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.49022908128800663,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6776,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.5011160457821283,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.7838,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.5549048902973877,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.7788,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.46499151041580006,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6757,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.5227086813836328,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.611,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.5269337175198842,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.7644,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.5937431269608971,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.7407,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.7069015746962616,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.7024,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.4820769595037139,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.7244,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.6025833896320884,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.835,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.5122563531231694,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.7097,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.43897546891764955,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.6709,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.47938233123497537,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6765,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.4074266138722427,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6932,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.5324500134388179,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.7736,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.5665658629227085,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7511,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.5028946018469815,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.6953,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.49206414908888135,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.6946,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5304755577081713,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7475,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.6393735675642492,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.8167,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.513773800806844,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.6585,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.5662132456840565,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7496,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.48634830493495995,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.7295,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.43229534390856567,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.7104,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3938232209502425,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6811,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.5132216213085693,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.737,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.4195156568509407,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.7066,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.40033997960486356,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6818,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.5373976326564892,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.7328,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.45754895410055296,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.6853,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4575813475049241,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.5971,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.5451849398635704,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.7032,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.5441774619324102,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.7331,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.39524876304033657,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6186,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.41980029309219075,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.6764,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.4290035490569862,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.7402,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.4344070777605754,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6437,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.41828191602597053,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6458,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.5392793424158163,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.7386,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4887441003063013,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6862,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.6256757383610962,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.8432,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.5464595903115794,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.7301,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.5638335821204267,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.7447,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.5525261367409167,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.8391,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.4953289966782963,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.6948,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.45134057631510055,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6883,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.42700165725759,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6537,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.49804296009133825,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.7491,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4871235507358856,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6385,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.5887939524755246,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.8623,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.5463547126142305,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.7247,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4681078387739679,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.7115,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.5399798111944335,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.6925,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.5538299714024002,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.7917,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.4655161305883387,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.7605,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.46046084755303923,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.7305,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.6432362937427276,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.7866,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5233846063949636,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7926,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.5790472384084447,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.675,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.4714697559149237,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.7253,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.4270576465559357,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6784,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.5188902875539768,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.7824,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.6367857125346924,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.8625,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4906668326163334,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.7062,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.5277813993514586,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.8342,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.47983067644242744,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.7006,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.43792461864947296,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.7622,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.4916156036073879,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.7127,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.5768343416347183,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.7781,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4569625310327317,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6978,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.4554160260672957,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.6804,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.39705504892736143,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.6464,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.5352078742401196,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6905,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.5554651283839609,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.7253,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.45444727275032104,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.7549,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.49208812313330785,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.7315,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.4950480797316589,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.7494,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.5498653769016123,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.7609,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.49438895303259683,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7298,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.3536975783106019,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.6478,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.603891234009926,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.7822,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.456745084908359,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7652,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.4775098878163809,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.7414,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.5057797701774622,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.7019,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.4220879754209838,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6576,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.5444032294164403,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.8257,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.481955756353624,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.7183,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4362237396864434,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.7494,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.5170191679170411,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.7928,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.5080971785760765,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.6409,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.5161371330680553,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.7773,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.5304994174906112,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.7557,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.4454709693189586,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.7259,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.49547457346470486,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7042,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.3729345692266381,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.6745,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.50135253803681,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.7581,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.6832387692102796,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7099,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.44670629381081794,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.7218,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.5226987026394574,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.8117,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.4598291624037757,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6637,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.4829505048181646,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.7263,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.48790533521950497,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.7045,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.413581741770677,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6852,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.4483225785193507,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.6595,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.5535810560099256,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.7033,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.6466696989559793,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.832,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.46938949706307914,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.605,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.5142034808023143,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.6822,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5907942946614096,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.8282,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.498830697777462,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.7772,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.5974015247062966,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.7316,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.5259485131736402,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7492,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.5107853708990638,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.7255,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.5795296746970275,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.7327,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.48555881447046184,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.8639,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.4531193486352657,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.7119,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.513527134320427,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.7414,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5954943410632516,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7679,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.5613384242425445,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.7713,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.4444495696459105,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.7333,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.6737660720946029,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7759,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.5986316877783368,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.7094,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.3811464696827495,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.6575,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.38979442506119516,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6019,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.5112200475446241,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.6747,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.4925816723698563,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.7111,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.5026870895601208,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7843,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.5327465316476543,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.794,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.5553229160368591,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.702,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.7168884703670975,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.8406,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.609620211368279,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.8081,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.5099940552094323,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.6834,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4236752960093711,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6256,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.6151341652474434,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.8692,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.5796419138079855,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.7776,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.5134552362210623,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.712,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.49380865507584276,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.6452,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.5081933379437217,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.8112,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5121736787964045,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.7272,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.48009551155070473,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.661,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.48465583201319534,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.6784,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.685254036551879,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.7132,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.44102035361865277,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.7113,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.4994805408242732,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.711,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.5602382620403693,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.8379,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.5671929880193168,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.7182,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.4630358554367751,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.6504,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.5289835274851352,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.7377,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.47775023907270436,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.7138,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.5515531883262018,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.6868,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.44824653933517705,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6341,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.41486994233454416,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.634,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.4545108148817928,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.6034,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4489533834839864,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.5554,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.5405287066083088,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.7041,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.4972187285317307,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.7534,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.5154262619152171,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6712,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.3663032585790623,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.5763,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.7023035553436592,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.7126,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.457900813666415,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7207,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.5554462953531683,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.8523,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.5480430788653078,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.7748,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.5988734876106011,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.7738,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.45939067209244255,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.6841,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.6290835230461248,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.8871,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.5737171101704664,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7974,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.5603477018432383,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.6972,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.4958360653147047,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.7337,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4948548812131531,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6741,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.559838660552377,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.7608,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.40902355723274353,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.699,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.504641703128184,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6953,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.5149518116555113,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.7222,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.46055043846863936,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.7279,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.6861137346352174,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.9197,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.6202906634173873,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.9064,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.6052956382369696,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.816,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.5092774321360481,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7326,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.55870392357648,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.7598,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 1.0762146408116535,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.6783,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4095981231341868,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6808,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.44874326314271656,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.6651,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.5942177582265716,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.6917,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5073765704372167,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.7164,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.3834286436527206,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.6655,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.4142314259895984,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.6932,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.4614074512356123,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7713,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.4282735890102852,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.6883,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.45682689964176815,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.7074,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.49523647877393767,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6899,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.4242852673300794,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6641,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.4348634746466489,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.631,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.47104771509485044,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6354,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.6595089288899116,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.7565,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.45446723321640703,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.7437,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.39749840642328826,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.652,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.418676467578698,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.6729,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.4167017082623031,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.6532,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.5261322915065519,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6475,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.47291652632472153,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.6616,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.4231944470460595,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.7635,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.49601636170391883,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.652,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.4860629505490482,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.6797,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.537269565059725,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.7567,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.5361527148663102,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7547,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.40056536356318667,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.6737,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.8346711580260499,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.7156,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.6127306118234916,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.8728,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.5128203754782934,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.6896,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.4633092483624061,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.7922,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.45462026648594467,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6976,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.42808648619396766,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.677,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.4651930398760335,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.6459,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5187080741739207,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6837,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.49920522537938794,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.6918,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.4287234489018753,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.6631,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.5796082685254073,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.7036,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.41699761236237465,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.658,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.40963255834753687,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.618,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4266669253277369,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6551,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.500144255977364,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.622,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.6423284317172347,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.8343,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4785552023213915,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6753,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.5236688032524741,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.7347,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.4491627730242979,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.679,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.5533506587760416,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.8003,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.5595371188782717,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.7421,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.4560730862550497,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.7176,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.44371075720210884,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.7023,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.5542076134403445,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.7239,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.48346860118408197,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.7135,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.6524116192763804,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.7735,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.45036943940470364,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.7418,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.8386021628329972,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.6556,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.5626439893049194,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.748,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.5415177761468162,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.8099,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.5112037404168135,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.7068,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.6180075417601816,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.8498,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.5179797392869975,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.7522,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.4459293336057215,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.7184,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5235429666240141,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.7367,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.44471026541607844,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.752,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.601396377057578,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.7916,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5121445197696045,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6208,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.4603280036865616,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.6457,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.47236374997848657,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.7362,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.42287209768869616,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.7348,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.42712644073449385,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.7161,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.48340298121129716,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.6387,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.5658462960948197,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6593,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.424084893434337,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.6479,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.5398925397600337,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.5938,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.606679189727865,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7922,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.510985246916621,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.7588,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.4722608230327989,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.6713,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.42446110331205,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6643,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.4163265120072367,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.5705,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.5475943904187824,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.7402,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.5168728639343029,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6617,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.5155818274958027,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.7394,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.6213963608647247,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.7629,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4450693753993077,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6953,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.49150080427279746,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.6962,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.44378449436598577,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.6751,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.4266567562385669,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.7267,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.5248025026202432,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.813,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.5436474839305038,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.6394,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3907459439703028,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.5963,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.47358630291017373,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.7958,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.5943288010108304,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.7544,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.42006006444833216,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6587,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.6834227269857183,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.7348,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.4744275929462346,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.6711,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.514414629181434,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.7577,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.46627494146408927,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6638,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.4953757495199464,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.6732,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.48769603448556365,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7318,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.5257519553325768,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.8015,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.468044162237833,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.7128,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.5165267345687184,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.7722,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.5179373494952317,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.7424,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.5459932728845321,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.7631,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4475477142358694,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6574,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.4678213911961665,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.6534,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.4445865890861479,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.6866,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.44496802431382726,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.721,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.5498354517588652,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6678,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.4781268590749618,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.7688,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4795911897631534,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7867,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.4157086767533849,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.6362,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.460482019566879,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.8488,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4011290702239954,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6229,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.5335791859343584,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.6649,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.48008138017971996,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.6594,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.510030532078524,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.7212,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.6116496219261657,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.6314,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.5191573242107503,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.7689,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.48011259367440917,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.6856,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.8095906956777874,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.946,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.6547187796594096,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.7037,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.4391012158565832,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6496,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.48372715186069265,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.7216,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.551843828007968,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.7497,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3706985805846255,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6008,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.5079646438573806,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.7427,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.5202994781101107,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.7175,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.3951489412205076,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.635,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.5548245108473696,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.7317,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.49619550117391037,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.6544,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.5787425597304093,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.7582,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.4821868118943535,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.7382,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.49726536415452277,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.7179,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.4916875608972585,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7419,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.5013111067249194,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.7238,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.5111579653157136,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.7329,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.42180275764096986,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.5899,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.4136580821470877,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.6377,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.496226995771142,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.7685,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.5285898567821652,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7959,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.4197114796133001,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.6642,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.617889865561174,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.8222,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.606292067950159,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7569,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.48358230199811053,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.6421,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.7161695624087144,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.8082,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4051625738114139,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6526,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.4299181922221351,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.6727,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.47701691210897,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.7156,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4466971408145806,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6131,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.47586344574699047,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.7455,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.5333491333238738,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.777,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.43097036219369156,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.686,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.40306995652351624,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.6632,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.5054067762609026,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.6999,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3771454558465983,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6567,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.5509412959915065,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.7371,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.4482264117064355,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.7454,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.5201668268071643,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.7104,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.4365736211274258,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.7213,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.4606040027591618,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.7046,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.37495205500136525,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.5511,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.4649202973154337,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6964,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.5162600069616848,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.7756,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.44816010084396685,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7204,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.4857083955103614,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.7151,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.4868499988858953,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.6853,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.5366789933159594,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7534,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.5758222815631294,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.7595,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.46310850721307445,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.7028,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.5561069134921421,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.8377,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.6560905308494205,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.6879,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.4449699955108516,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6719,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4060169297987321,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6261,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.5230066158044145,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.6871,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.46996784274130565,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.7292,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.5050186353073788,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.7157,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.5211608433595872,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6562,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.5379631995615644,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.7603,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.5288416301092489,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.7771,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.5617692135985728,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.8067,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.469019831644795,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.7219,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.36186071420054156,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.58,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.5514952842601946,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.7673,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.47616261536395255,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.7325,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.49771928947304866,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7444,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.490805076177822,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.6328,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.4505767877704554,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.6216,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.546886120714011,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6847,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.4652153899264941,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.6592,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.403288371950824,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.6105,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5203408586803837,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6982,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.5302648639664964,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.7863,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.5999242011265087,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.7431,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.5006843905869421,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7036,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.4965402200645782,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.783,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.4969460812661328,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.784,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.531648603927093,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.8132,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.6240854665278419,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.8512,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.5580182083189841,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.7281,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4859660233476685,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.7066,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.5125505489699571,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.762,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.5208541059125794,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.6834,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.6417611341342232,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.8377,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.39820648268764225,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.6743,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.41943172803182305,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.6715,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4365793004569459,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6766,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.4757877763567356,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.5985,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.5696292967795435,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.7671,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.46415712059206915,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6021,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.4882964088303243,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.7239,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.5036691213297696,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.6801,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.44412768989414975,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6192,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.4334056206380038,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.7146,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.5317515881273797,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.7146,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.44830874612596283,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.7163,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.5118343343875779,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.6868,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.5362455723338327,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.6871,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.4433335474125894,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6308,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.42613712933795245,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.6636,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.44864076347431425,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.7606,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5355535529859561,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.7468,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.5484865023173826,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.8353,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.5197743045775367,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.7494,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4131281147883286,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.5756,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.49358540741723306,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.7275,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.4289229445657535,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.699,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4987867160996175,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.736,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.44847119443742095,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.6662,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.4404285750263958,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.5925,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.5197926230065087,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.7181,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.44517753678196087,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.6479,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.43248993328356955,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.6896,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.37888857741562054,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5676,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.4288486528383463,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.6826,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.48226318453468403,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.7637,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.45033126161507564,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7054,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.5224814686511968,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.6732,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.5040000891510696,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.6622,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.42715592838634286,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6281,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.44340509873449113,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.7003,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.44648750338781834,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.6881,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.42941753278300676,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6005,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.5182651819295042,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.7509,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.5228807722686285,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.7386,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.5136503316973929,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6742,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.49047170464661055,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.6734,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.6186312205906592,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.81,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4526584209336811,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7051,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.45617373499579417,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.6902,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.6027724907975999,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.7722,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4700052206922473,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.5983,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.7086846695221508,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 1.006,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.6814212714863245,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.7931,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.42297711302573726,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6786,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.5850099736471568,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.8436,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.42956006902121935,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.6895,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.4260271478296352,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6554,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.5290206099979772,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.6955,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.48785151961800294,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.7176,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4250442322020013,
+      "learning_rate": 0.0,
+      "loss": 0.6854,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1526950804455424.0,
+      "train_loss": 0.7908816375732421,
+      "train_runtime": 28102.0472,
+      "train_samples_per_second": 1.068,
+      "train_steps_per_second": 0.067
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1526950804455424.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..054f4932b83048bbd77dedd425621288b6dbcdfd
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e4f6873f8ffc707bea00278f882935a36aa3c20a
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fba053cf38ab9fd24bf53350693387d55edb1fb204cad8950e80f774cf3b1382
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..28d9d8b407fb624923856103b6cc51eb9cd6be41
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:619757c697845a8c27d4970528301b1a73673034c8ba81f600ecf7ff8403f1e5
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..204aaee9761bb279ae87f756ecbddede99cdad90
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.7125623597453534,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.1919,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.0674347992870037,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.2128,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.8728510850894342,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.1698,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 0.9110198903157677,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.3857,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.877247879975666,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.3312,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8140465172895612,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2262,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.895253458979813,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.2084,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.7333061355376127,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.1988,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.6931096936857353,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.1417,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.803854388768664,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.2071,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.6601916733476776,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 0.9522,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8912905302625813,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1968,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.8430962018602007,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 1.1235,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.6550051251096707,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 0.9506,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.1453521781894755,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.1559,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.7815021254168898,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 1.0631,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.8095443644525637,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 1.1157,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7243647678054499,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.0483,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.8661360484668469,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 1.0757,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.9197298869430908,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 1.1066,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.7445562450644425,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.0975,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.5301275567251413,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.9044,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.6628615339362863,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.9849,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6189170160418722,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9818,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.6181847218507321,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 1.0158,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.6121104085973382,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.8488,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.7523164934717913,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9749,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.7530922285738954,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 1.036,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.5356182338531568,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.7856,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5288834988820852,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.848,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.6164517754077974,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.9562,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.4632419418202054,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.7896,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.818896020329126,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.1009,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.7170363974154533,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 1.0739,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.6836934050794474,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.9919,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6643085774792657,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9648,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 1.3538917299813327,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 1.0665,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.5594299868622598,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.9335,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5988358395923179,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9207,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.5071900127171447,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.8156,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.5033370206265442,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.8482,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5750991048540888,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9306,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.7797439497339441,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 1.0038,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.7397829823836828,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.9437,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5147050573707562,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8392,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.5508332246098021,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.8978,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.6240366974915812,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 1.0485,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6206028252741724,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8915,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.5378699982729577,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.9918,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.4615725517131061,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.8364,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.839143978619063,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 1.0965,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.791627020793675,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 1.0167,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.5353898387675298,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.8873,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.46155943784236264,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8004,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.6060608354221403,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.9862,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.5540773123940826,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.9106,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.48593847336045554,
+      "learning_rate": 0.0002,
+      "loss": 0.7632,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.5628081160242835,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.9321,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.5569538768407111,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.9621,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4733778178712535,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.805,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.5498882038803397,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.8731,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.6276025403213903,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.9597,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.6262714541836627,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9889,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.6116976323121694,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.9198,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.6830838441856558,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.876,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.652455481238001,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.971,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.7252847928776754,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.9825,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.6524244782693203,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.8155,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.7972076010671612,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.9859,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.6557139336610636,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.9518,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.5767944519846985,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.9456,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.7424314830058474,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.994,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.5590183604518738,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.8808,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.5326291568201051,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.8654,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6344187236250632,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9406,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.5164870062084322,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.8699,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.535560546495017,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8904,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.604254569720925,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9037,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.6820823738021821,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.9401,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.6680958377488201,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.9381,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5953355425943272,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.9229,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.6303538915978696,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.9379,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.6555513492310497,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 1.082,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5544528105821924,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8451,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.551371645828701,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.7987,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.5663512160236398,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.8809,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5334254537578645,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8859,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.5864713325791271,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.8676,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.4529120336108661,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.7923,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5629961828154081,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8905,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.5060733125222548,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.8003,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.6099839802016345,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.9851,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.7766168781243934,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 1.0454,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.6574911191619291,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.8434,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.5412993314160603,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.8626,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5318278327058145,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.9377,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.6418398169575796,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 0.9495,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.4584177185767165,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.8138,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.6544310443497915,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8797,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.6929605791129708,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.9384,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.5426845719604184,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.868,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.6276077620947658,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.9038,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.610444561137079,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.9425,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.5085934994219482,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.8444,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.47254420281476195,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7566,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.6359428663755088,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.8932,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.5833109167273566,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.9093,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.7101667537550055,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.9376,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.6655308545817167,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.9556,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.5239100967263433,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.8741,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.6492043209126855,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 1.0431,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.6215814129519589,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.9475,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.6740097060761598,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 1.0179,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.690783804749006,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.9117,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.5221887417592358,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.8273,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.5304131053513017,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.8758,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.6580880007697816,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 1.0554,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.5121534197297978,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.8894,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.48233888311585627,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.7437,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.6005642494183314,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8986,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.5404763024560123,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.8837,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.5935111172167694,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.878,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.6580202309296764,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.9591,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.783516788896961,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 0.9493,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.5888802226903442,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.8927,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.5723859363109641,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.9258,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.49145024025740786,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.8225,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.48434420326029637,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.7424,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.6298233258277486,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.9409,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.5713270653533685,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.9058,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.6221411704388371,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.8763,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5626768128370655,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.9247,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.5697871650413173,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.806,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.6608383412606968,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.9004,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.584120743461091,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7749,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.5221342816426137,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.8579,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.5757818080661625,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.8815,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.6705434878822246,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.9205,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.5023728751807374,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.9135,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.6853077151873902,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.9295,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.7253071327402918,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 1.0147,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.6436052857411607,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.9075,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.6739660650927344,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.8374,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.691497400214393,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9414,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.5369938083375458,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.8273,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.456701747705484,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.814,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.5059826225091343,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8691,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.617153667680866,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.9196,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.5601388439895788,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.8726,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4531004220034202,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8086,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.5684981456905152,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.9698,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.5671637065437304,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.7799,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.6834264329339218,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 1.0069,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.5939675059955585,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.8895,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.5067830181763635,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.8352,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5755168088159903,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.857,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.5914189367976758,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.8485,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.5161594091088438,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.9185,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.7476903413837354,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.9803,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.5232836284681833,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.8095,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.5151456889733076,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.8796,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.46469856502654683,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7858,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.5299216119887092,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.8212,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.5053108286419691,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.7556,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5808418181585019,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.862,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.5585096838497986,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.8968,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.5119764000821178,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.8536,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5870047182817826,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8515,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.7397478155545474,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 1.097,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.4864276836247347,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.8035,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5744790894501604,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.929,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.5799086997777948,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.8759,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.625810635582127,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.9066,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.6912852183983189,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.9587,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.5504811720653686,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.8924,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.5509328554394032,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.8583,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.8655289083781732,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 1.0595,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.7231047016289638,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 1.0298,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.5661017436347039,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8955,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.7547569870104339,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.9419,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.4770486725682064,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.8131,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.6187581030776353,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.8965,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.5163544468105237,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8454,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.5793645359884587,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.7825,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.5436027110758196,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.8983,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5047471384761516,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.864,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.6023418503550535,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.8629,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.6195352633209115,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.7857,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.6361948905586187,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8491,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.7444682066816518,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.9496,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.686742446118089,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.9093,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5113379352699342,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7989,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.5049749919705699,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.8463,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.47457260638752874,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.8252,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.548001405270674,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8478,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.5765836534456652,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.9433,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.5510096349707276,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.7919,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.47654226994156895,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7935,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.650131696544161,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.9422,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.6350761406004101,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.9221,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.45138773459013287,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7349,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.4085582171559744,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.724,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.5614815222053043,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.8807,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.6353069654484065,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.9196,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.5493836037313905,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.8658,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.6096807278667714,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.8914,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5877456530312468,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.9244,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.7394221123604389,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 1.0139,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.47483691931477484,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.7767,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5337733477737856,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8779,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.5551043332116106,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8515,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.5753532122700535,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.8425,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.5515137414835763,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8717,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.5989747854092063,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.8948,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.5690350553956068,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.9121,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.535263534612997,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7511,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.5367622148610552,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.8927,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.4765297941650462,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.8122,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.46220681275412034,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7763,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.5408068249788149,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.9132,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.4997243415023694,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.8844,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5151314108392654,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7692,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.611846646200774,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.9588,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.6200993223544174,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.9916,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5448278768368114,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7907,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.5276761208069791,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.7751,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.5080078311818583,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.8814,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.49117795723543123,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8158,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.562514031292076,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.8298,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.5797678566569934,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.8828,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.6577166703095476,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.9074,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.48373898945054816,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.7984,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.5307741985891761,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.8549,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.6142434497799985,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.9496,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.5938697329821294,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.9931,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.658751929512093,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.9422,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5596776517350291,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.9411,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.5238208832588955,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.8706,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.4895294860974578,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.7881,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6272015233556667,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7681,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.45760205070554244,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.8156,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.4664629500963827,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.8131,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.5458988273906292,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8886,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.5440558602615054,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.8039,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.6619631460221503,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.9791,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5381141507925012,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8854,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.5664167012076495,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.9155,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.541046429290967,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.8782,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.5453642655002823,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.9673,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.5697833022573667,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.9215,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.47879164860636253,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.843,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.6495951621386878,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.9015,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.4636656153460495,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.8059,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.6725516324126248,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.8878,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.562661589535857,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8702,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.6505106114904452,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 1.0138,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.4574309778395966,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7697,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.554929549141918,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8963,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.6040855008818894,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.9157,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.5905603233723661,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.8624,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.6270065120847444,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.9767,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.5514306602953485,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.8357,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.4374748974270059,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.7702,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5204322990650728,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8523,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.5033734052047955,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.8098,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.7935532163140239,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 0.9774,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.5412100673639471,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.9287,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.5219454497503629,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.7984,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.6061628378117733,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.9205,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5161486477334278,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7802,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.9075205901276785,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 1.1094,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.5262421682921276,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.843,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.44567338043003074,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8223,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.6767324917049274,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.9738,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.8764162817484429,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 1.1272,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.535796392615648,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8814,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.628205380189567,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.925,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.6255289735241575,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.9198,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.6645268896556138,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.86,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.5935261617288775,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.903,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.5707801546251843,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.8798,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4906465515099141,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.772,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.62594135393271,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.9959,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.585052245345864,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.8952,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4802588033010636,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7456,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.6392170407300541,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.9384,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.4963931621308698,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.834,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5826452587701353,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8792,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.5612821833468113,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.8895,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.4528717380241006,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.8122,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.5375315189417282,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.9046,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.5305721749962367,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.8452,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.5154352397321587,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.8022,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5288507605466519,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.9426,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.5862423688214016,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.9661,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.45551657692460773,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.8545,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4811193693958405,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7884,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.5258397858079111,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.867,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.595162856717673,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.831,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5659986534420202,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7968,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.5868058207947794,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 0.9463,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.549031007008094,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.9249,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4388197544653247,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.7764,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.5052690162443814,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.8578,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.45750769573278793,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.8042,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.42333752935950547,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.6729,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.6361550342067476,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.8955,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.5838815775367465,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.919,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.47131326237580695,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.8234,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.4939472774275297,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.8204,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.4661191073013235,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.8462,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5161016855471054,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8125,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.6230917896093047,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.8682,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.5465885854054142,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.8712,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5838856241344086,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8742,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.49070003117787664,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.7377,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.485262845687119,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.8659,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4831813449914069,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8319,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.4581343977513169,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7677,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.5571624248436221,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.7621,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.5849752225132492,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.818,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.5608480928855067,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.8772,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.6465737882703233,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.8579,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5953323704630802,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.9377,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.6159644380271597,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.9118,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.4360808267460942,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.7541,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.5513392870287326,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.88,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.5327690536560286,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.8506,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.4041086223362272,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.8096,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.6228855606861909,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.9362,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.611510846634051,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.8662,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.48009499579003484,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.7633,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.5541681054693345,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8379,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.5982894164030271,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.9079,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.5548055035975625,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.7873,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4727454212792718,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7872,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.5257309643502509,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.8199,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.5360988450390519,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.8156,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.4892954466148729,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.811,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.5316589543705368,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.7961,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.4370835791050171,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.8066,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5555697668670965,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8289,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.4958795650773353,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.8921,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.4561120934623149,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.7719,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5831076390016625,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 1.049,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.5067384623030481,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.7914,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.564946409052129,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.8551,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.491576477038357,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.79,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.5572261155862809,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.8445,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.5212090244254038,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.9127,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.4913901261761234,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7871,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.5241268418892111,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.7065,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.42515391209663606,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.7261,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.6581467658741232,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.9722,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.5298306438086058,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.9069,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.46546361098263916,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.7564,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.485893984659004,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8271,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.5144090274852071,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.8359,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.6949783760386742,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.8699,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.6198190399280962,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8451,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.4401101529728473,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.7306,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.6101096311868891,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.8547,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.5377151040001709,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.817,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.47658728416602897,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.8621,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.579309769307992,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.8925,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5376705048519224,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.867,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.512744799308176,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.7927,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.5994810647777205,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.8483,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.44422235051039305,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8125,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.5180666707705607,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.8957,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.5524856689864757,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.924,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4702109494579363,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7929,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.5994746178925932,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.9002,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.47361831549663996,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.8271,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5930536559375674,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8387,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.5238430141281338,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.8138,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.506429838380469,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.8263,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5794579155651455,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8536,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.47506482339905026,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.7853,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.450027524333986,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.7821,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.557993144873831,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.769,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.4578599686187814,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.8607,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.5844386845197508,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.8208,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5074206945564635,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.78,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.44395521429628143,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.7179,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.5100997833574008,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.7766,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.454280185635574,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8336,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.5285292757673242,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.8874,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.6344098140860316,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.9041,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5149194288974933,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.857,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.5109486020277634,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.8142,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.5235105155371217,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.7875,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5503602375551576,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8122,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.5674118651197855,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.7899,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.530104587343037,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.7918,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5115579238706,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.9503,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.6329129854824773,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.8994,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.5377736255259186,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.8565,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.5079375630365733,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.9063,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.4573476129721287,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.8598,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.49418943063174736,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.7805,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.47243497761678455,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.8249,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.4112186563303933,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.7063,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.4414313706621415,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.7382,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5761644030030776,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.8861,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.7557863909743763,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.8796,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.5402777514357807,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.7842,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.48727149899937433,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.791,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.44541974930624056,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.7707,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.580246214808751,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.8541,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.581307078232911,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8594,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.5708358247918823,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.9647,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.4649649314795202,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.7483,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.774756832425865,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 1.0104,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.605587182068161,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.8086,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.5472845405240865,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.8232,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.5017489912343042,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8404,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.49818526027683374,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.8592,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.689125839132402,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.8359,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.6821308162692863,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8958,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.5804857693615908,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.9093,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.46982462769170197,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.792,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.541140037074842,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7495,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.622171168911818,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.93,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.5569050502367818,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.928,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.48875970515751554,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.8371,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.5755493350394935,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.8447,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.5583798722074151,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.8703,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.5206717758140702,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7646,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.4885017057637796,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.8477,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.5604629624899274,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.9046,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.464216914383139,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7891,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.5746567610258846,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.9133,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.42200089908841276,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.7272,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.6165414978826635,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.938,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.4702346524928013,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.7541,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.4727315201073824,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.6898,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4299838281148631,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.758,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.44337457223936744,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.7556,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.5329729018475009,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.864,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5705008239613129,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8289,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.47263183096368583,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7253,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.45304333722800333,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.7468,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5060938939140778,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7394,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.6564144054411251,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.9074,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.42445989401827156,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.739,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.5315627798908884,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7656,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.4688436730717214,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.7674,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.5810720136539039,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.8692,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5812458391708225,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8987,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.4799125350467684,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.7877,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.6412843712952934,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.9124,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4507351816970884,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7917,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.5232869881443669,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.8898,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.6678830672310265,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.9137,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5801502567328287,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8537,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.45841158112039687,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.8383,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.6025432050942682,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.8565,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.4970815494062268,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.851,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.4518959864049737,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.7825,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.46228300717288223,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.7594,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4739273906053965,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7488,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.5092330687540366,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.9223,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.5717462977076776,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.8208,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.5863086266796643,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.9333,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.41277279746009543,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.7436,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.6106807286396574,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.9381,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5677050966969265,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7547,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.5833341849301831,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.8697,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.5290752412549475,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.7354,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.5779717687931067,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7323,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.4392699173768945,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.7517,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.5508248857407402,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.8888,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.6781607961525851,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.9684,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.4791095538129747,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.7642,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.5166694348338413,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.8259,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.3489532858102896,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.6568,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.5245925867164768,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.8743,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.44304976380850064,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.8157,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.42422297872920073,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7929,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.7557404944246874,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 0.9597,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.48485914445666506,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.739,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5039331423519677,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.858,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.7303548271504826,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.8248,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.6772369751239833,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.8311,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5187610284586263,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8564,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.464388798089011,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.7823,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.5359802209397111,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.8423,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.556993993005146,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8595,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.41901610477892903,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.8083,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.5102021415665858,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.8268,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5754800536381837,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.821,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.48205165510631953,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.8306,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.5247804469877534,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.7947,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.520546460368178,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.809,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.41373617382203987,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.7432,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.44134956144894705,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.7787,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.475468089448575,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7971,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.6027268130260609,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.8426,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.46290862472857397,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.7634,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.566108444859895,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8173,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.5507765253343672,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.8316,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.40469652703742315,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.7054,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.48157658854253194,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8114,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.4801546623719645,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.775,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.4301332412661433,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.7568,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.5135811751297393,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.844,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.5221443518980844,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.8096,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.6576443077268289,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 0.9185,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.6650147294272072,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.8357,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.5463577951199781,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.8872,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.4984353756271598,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.7617,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4616081485174494,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7243,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.5084453619371878,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.7767,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.626500513697084,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.9369,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4529032013789942,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7285,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.5515420159752935,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.8331,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.49116840088614994,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.8529,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.6553757063449491,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8676,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.5824943501067116,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.8433,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.5497170047766405,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.8439,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5140969276726812,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8009,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.48686499946393014,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.8416,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.5168470295829324,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.8766,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.6026993798612537,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8608,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.48750661458077205,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.8569,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.5205322822608496,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.8943,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.45029603832584264,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7671,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.7667010384373698,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.936,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.6808760932653368,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.9906,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.5107596500012404,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.8291,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.615128336771932,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.8414,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.4972186646105337,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.7722,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5552812456187501,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8224,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.5412211364871428,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.864,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.47978554124338063,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.8055,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4962602604272398,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8053,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.48090103519226757,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.7141,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.5189967882380148,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.8366,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.46111912936807337,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.785,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.5539541504876097,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.8262,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.5247755483401154,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.8989,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.47242081364961513,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7879,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.5696691564826674,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.9036,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.5620281936198633,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.8152,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4672591331363155,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7788,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.6262792002322631,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.8908,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.518269709369013,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.8044,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.39337241146210455,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.6769,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.5417690530616373,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.848,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 1.187369023455772,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.8774,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.5423071451639491,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7612,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.5084989532325404,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.845,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.5604399463617918,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.7828,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5264007894818428,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.8529,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.37714368324182157,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.7372,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.4490152569575573,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.7629,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.5242042096888933,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.8096,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.7189712158273117,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.933,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.5567176210351147,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.8033,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.6606946387299742,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8288,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.5674771200484571,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.9467,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.4934615713701951,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.8295,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.6417839110950121,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.818,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.5258462179062289,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.8301,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.6337368767683507,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.8649,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4598815827476441,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.8378,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.45326439680218955,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.7619,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.47063645637459545,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.7845,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4463096527646349,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7873,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.5566967859146053,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.8291,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.5026777227425168,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.8067,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.45265094263269107,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7061,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.518154942694519,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.8052,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.671606637480916,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.8991,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4264578310653122,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7505,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.5875387971076562,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.888,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.6559006975934076,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.995,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.5510152039312369,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8637,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.6094500810226675,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.8572,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.5620392199625313,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.8427,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4799019641814468,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.883,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.6367837218131216,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.8408,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.46555585537157745,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.7537,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.5013403235334765,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.8318,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.5632794940134113,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.9249,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.6051919782229207,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.9439,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.5289845126344039,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7709,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.6319831806991704,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.8988,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.5270325735548156,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.8349,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.610773726540534,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.9001,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.5316834587247324,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.7978,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.5118701114233776,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.7495,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.6208567717250233,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.9326,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.5471864762151855,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.8352,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.5276315900469803,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.7815,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.4721223348277774,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7736,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.46873833958214894,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.7499,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.529810506600649,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.8143,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.44327375649757317,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.8177,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.46649303987907914,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.737,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.5574604609860693,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.8278,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.42663942892965007,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.675,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.45701736799284803,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7096,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.4720496840763521,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.7855,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5063091619828909,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.8439,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.49331665539251424,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.8041,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.6465649336061632,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.9504,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.49017445195332465,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.8274,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.5478699688037733,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.8629,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.532454452552899,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.8421,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.536805449004451,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7848,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.5215712291897809,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.8874,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.5487771288459847,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.8961,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5116161086365358,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.8396,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.48924499411970224,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.7564,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.5364015158108658,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.7626,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4663661313576595,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6755,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.4761060519561345,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.7709,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.46693843751676056,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.7818,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.5503320372672844,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.8936,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.609742122470184,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.8283,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.4786888058526543,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.7054,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.5468961137967208,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.8858,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.5079561568537743,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.7851,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.4686408474857009,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.7707,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5996669398126965,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7502,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.4449647185867954,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.7447,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.4834378503665272,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.8161,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5402678542348858,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7989,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.4463779543248393,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.7468,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.49253680351793444,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.859,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5474150764496007,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.827,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.5574422779343221,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.8604,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.4531471454712154,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.8479,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.8027928220061923,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.8997,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.5309692551303301,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.82,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.5254409038796218,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.8391,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.48868712676371545,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7736,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.5746536982609687,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.964,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.5663542278344792,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.79,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.7322489622484162,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 1.0124,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.570448953372935,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.8195,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.5244050729061402,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.7873,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.563009538878919,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7979,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.47992530098772557,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.7757,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.5488372438427589,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.8166,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.8745056416840453,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8564,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.5704675748450467,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.7455,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.5079628164978088,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.7786,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.46045188551189115,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7682,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.5642003124117353,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.8237,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.45036166419602947,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.7517,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.62770242511897,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.9288,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.5286171186931936,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.7778,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.6078858094826769,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.8137,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.428744760977173,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7799,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.6999273455997498,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.8146,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.5741621314582245,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.868,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5124125389535803,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.868,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.4881782705813716,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.8667,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.53066060155896,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.8457,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.5591736195185854,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8802,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.4905697116884893,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.7155,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.5844276562874432,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.9422,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.603290743417517,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.8599,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.5088283751229241,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.7846,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.531314508584258,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.7477,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.48487488648848864,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7756,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.46300148178031775,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.8847,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.4817338737680085,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.7519,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5697214418512658,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7859,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.6148209323459753,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.818,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.5003433116216678,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.8501,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5432992354727689,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8104,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.5083729917715238,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.7957,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.46631811126979217,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.748,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4274038453696366,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.6714,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.4642123927377642,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.7946,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.5682266655302921,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.893,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.45219212372742434,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7013,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.46249946521024066,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.8187,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.5362530489588935,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.9251,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.47112151670674485,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.8204,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.43562242348644264,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.817,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.6017771725031066,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.798,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.5331535348236007,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8401,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.5717651781716162,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.9325,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.526960588209441,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.7866,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.5253453194212678,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.8615,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.5285391045903164,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.8502,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.4159020717447715,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.7361,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.47293504331080716,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.8081,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.6337490496519959,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.8865,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.5802499413114943,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.8497,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4176215842349179,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.6968,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.4825865010446155,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.8353,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.5184616712009764,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.7453,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.5336813961952697,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.8843,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.5713473772631764,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.8288,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.6517004215524038,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 1.0056,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.564644177450356,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8924,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.6382855123267954,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.9,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.52168540914425,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.7179,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.9199605405327886,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.8602,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.6630619697723,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.9434,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.453200819137961,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.7443,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5373021365917932,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.8585,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.6638608339620459,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.8807,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.5852792040916357,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.8623,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.6045019203606804,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.8099,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.5578165086402765,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.8561,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.5135089124962354,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.822,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.5229531767262776,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.8289,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.5760480824148356,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.9167,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.530069512579078,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.7878,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.5483414880778632,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7549,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.5805853791465041,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.821,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.4120710990268928,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.6579,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5476826539300194,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8163,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.6855726624547981,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.8886,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.4900184992286336,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.7316,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.46367309796918,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.8123,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.5535492820939049,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.8137,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.6416401917887545,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.8032,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.46814921453479513,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.719,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.42856462107175863,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.7507,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.5056120515390788,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.8359,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.4700390756542135,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7485,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.6086509832803118,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.8363,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.5078168467062275,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.8616,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.49783676941608546,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7553,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.49272398424024877,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.7754,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.49484045111362585,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.7436,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.5297393470406898,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.8124,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.6179185544593533,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.8115,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.5373045261507816,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.7461,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.45127402949937356,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7444,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.5181242897858944,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.8767,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.5655969062462569,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.85,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.5168015972212235,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7222,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.5531154877149115,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.8093,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.519075203089709,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7641,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.5481275811684683,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.8409,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.5239988375125255,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.8484,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.5181888297872964,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.8188,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.5057049279499224,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.8139,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.5086212837186439,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.807,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.5024752340256642,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.9308,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4089641077527304,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.6833,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.4351236747525622,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.803,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.49002818471326925,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.7913,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.4466890684514906,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7246,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.48094890037933113,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.7866,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.5754215224270137,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.8077,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5090030089273849,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7786,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.4891915865862635,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.765,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.46155122526575937,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.7049,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4841794153051128,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7862,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.6634877800731043,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.8723,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.5370652892356707,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.7755,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4801178316957006,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7923,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.5019134348868551,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.7057,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.46813690102209793,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.7459,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5398079540801454,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7864,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.4976426982944783,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.8075,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.46870292537268243,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.6822,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.545783509190825,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7946,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.5344080341027531,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.7631,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.6977326518743707,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.8548,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3904121194482292,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6869,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.4579030171134873,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.7054,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.5432445941796074,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.8541,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4507607056080124,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7294,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.5770144319792957,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.7824,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.6222383621338347,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.9508,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.5083213398241904,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7967,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.5769391778626363,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.8215,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.4753318829986412,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.8173,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5848923979207764,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.8812,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.5506274044347146,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.6753,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.48945492331007395,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.8095,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.47922500251569633,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7426,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.4474830678952825,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.71,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.4901138815648903,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.7971,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.46553748999705974,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7039,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.6818603988852794,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.8921,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.5970769694778394,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.9815,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.47872069397262745,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7985,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.3861546598029685,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.6453,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.4603594590690842,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.685,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.6089557164729323,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.8086,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.603704990267899,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.9752,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.7537321811367974,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.9012,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4930352911616266,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.8116,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.4315763050222387,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.7109,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.4637858942591107,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.7524,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4676651569490796,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7661,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.4897362982883825,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.7982,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.6143052681462965,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.7956,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.5266193218351073,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.776,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.5450427556760508,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.8631,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.6456045728698774,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.925,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.46905239069900634,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7524,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.5991620705144164,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.7948,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.40463001077070404,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.7792,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.5473140968009028,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7967,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.4846464786438877,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.7561,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.49235137719971495,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.7865,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.48082981780816214,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7826,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.45705357979137046,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.7237,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.6079835635983911,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.8438,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4697357339508937,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7754,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.4341312917563034,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.7538,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.49600772082635314,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.7376,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.5598019054150565,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.8522,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.3921679797595081,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.7071,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.586665310853696,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.8259,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.6436832286186337,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.8968,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.4532961689343766,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.7185,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.6054960827856545,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.842,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.5145811623028095,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.8141,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.5784054764016591,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.7729,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.7220645660369499,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.8989,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4980071811549346,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.8106,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.5348419778871377,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.8369,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.5353879738979862,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.7932,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.6651689873960853,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7473,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.5447327142112047,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.8328,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.5413161115653148,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.7804,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.5663918612560117,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.8009,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.5940970783318418,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.9107,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.4615580956786893,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.7274,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.47289010215546234,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7662,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.5282870098169392,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.791,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.7165095897507946,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.9146,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.51113905343607,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.8546,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.5025443898392142,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.747,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.48667988939210144,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.827,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.547760882436182,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.8357,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.5514487469145651,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.8204,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.5454077540745722,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.81,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.5471013858109142,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.8675,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.4681508026598542,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.7791,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.5056140317477327,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.7927,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.6032147508403578,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.8263,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.5495111420104595,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.8267,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.40949344338288285,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7962,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.4485925074166394,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7756,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.4345634842782298,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.7192,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.5809157009620177,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.6909,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.5158782652435949,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7563,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.5279338422076013,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.8242,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.570446810953975,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.8387,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.5039590872605328,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7736,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.46931152350354255,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.7721,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.5370225410341043,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.8398,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.48478742078050574,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.8141,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.36934981231353725,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.6759,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.6624303879159779,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.8269,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4479024680557409,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7301,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.5909808951009032,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.772,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.5209290099731972,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.7593,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4770750875340812,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.8468,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.4438156717825776,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.8036,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.40077651122930297,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.6876,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.382397124928795,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6042,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.5474722912103955,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.7456,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.5240517045008879,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.74,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.5834307542847271,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7741,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.5382694432201459,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.87,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.567001530938856,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.8846,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.44139727689976666,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.757,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.586655799449206,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.7793,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.6959512702435237,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.9044,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.592929579373112,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7487,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.48794267966208865,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.7603,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.4877169301389832,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.6525,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.46896945750895147,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.743,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.3997482721363189,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.7099,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.5253024025415763,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.7998,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.5408869645494739,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.8468,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.500146352240882,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.8356,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.5093589387038205,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.761,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.5382482323711277,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7361,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.44435322450815196,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.7391,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.5007857334530672,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.7404,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.6778260798197766,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.8802,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.45643085924947757,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.7668,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.42576546347475686,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.7699,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.4878184993169895,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7348,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.5948727960245362,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.8081,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.6086558137800696,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.7815,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4513654437774048,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7895,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.7062086064639138,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.9012,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.5468225898868876,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.8039,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.5036838194152392,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7766,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.6261411301740473,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.8658,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.4494687514476649,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.787,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.43209719743065006,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7668,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.4770864718919109,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.7272,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.4745528891103059,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.7449,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.7470359928641405,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7807,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.45471146705812854,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.7722,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.4253950362738694,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.7093,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.49454665074672327,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.8657,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.6964158869594843,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.8693,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.4414620619141758,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.8054,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.4672882678215716,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7854,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.5087716215112642,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.7391,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.4264799455950689,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.6833,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5367640341135063,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7739,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.372192351444737,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.7144,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.5048598526746167,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.7033,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5270412702986649,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7899,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.5481561010954568,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.807,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.4236700078969928,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.736,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.5273943593421361,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.8277,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.4372769409605662,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.6718,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.5905463937231734,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.7761,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4339096696432376,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7329,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.6385924503289472,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.8094,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.6224472942858802,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.7984,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.5987915588234599,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.8015,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.48684483100299114,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.6933,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.47889562425964494,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.7893,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.4427783280540483,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7504,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.45953862660412464,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.818,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.4632980682604805,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.7654,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4970646438155739,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7649,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.47855367676538474,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.8348,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.5150032033217121,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.7443,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.4827091017526877,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7594,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.5630702112689725,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.8098,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.6480528114874299,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.884,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5159217909240126,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7916,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.5262820068326394,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.7374,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.4993473580240705,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.7151,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4657089901590285,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.724,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.5228105787943458,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.7466,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.4133632297831274,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.7487,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5962099027010531,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7504,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.5156682261075197,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.7657,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.55049333424343,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.8423,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5652328546558053,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.8679,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.5851425172299666,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.914,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.5358604988343294,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.8724,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4765769291896708,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7257,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.5555226218834401,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.7924,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.48312215529746294,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.7212,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4778138090491278,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7238,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.6689189049077776,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.9201,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.4532393405967173,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.7076,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.506970275115917,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.79,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.46879915621380663,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.7753,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.5004248052216412,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.7596,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.48676595236330417,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.8244,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.4012451103630835,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.6793,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.5113472466549227,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.7168,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5585082337357712,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.786,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.5132952281443672,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.7239,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.5627208817373884,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.8331,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.5185703694893471,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7443,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.5778757800808362,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.7399,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.5194758506650604,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.7753,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.49661070959338893,
+      "learning_rate": 0.0001,
+      "loss": 0.7752,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.48506621784549836,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.8398,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.4875209967744212,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.6914,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.5712704204753644,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7663,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.42577760098407713,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.7742,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.5145933577513895,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.8162,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.47544170946764996,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.749,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.47257430999421846,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.787,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.43052217825413647,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.7219,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.46573574169891335,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7606,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.4733087979911936,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.7155,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.4930793593232939,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.7838,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.5797961248719126,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.8436,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.47705044869808927,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.7516,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.604131424977959,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.9429,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.44206954405281973,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7349,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.534673109019627,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.8102,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.5319890275331417,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.7864,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.5041767156879674,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7042,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.5009543847409859,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.7679,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.46028265141081554,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.7723,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.48486788993634405,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7767,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.5173426088972937,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.8181,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.5401368625618846,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.7998,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.49433598961855285,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.793,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.5585235720199297,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.8139,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.5752829228742448,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.7584,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.57631471280755,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.9417,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.4752286727403488,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.8116,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.5048984776307732,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.8326,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5016459109163391,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7255,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.48471486192604013,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.7633,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.5048034917358186,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.7139,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.5423514630028117,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7734,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.40773479485225583,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.7378,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.6044628207918135,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.9084,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.5387590843182661,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.8517,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.49638266994093244,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.7362,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.43680978163350903,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.7057,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.42401098881189553,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6663,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.5510951143045089,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.8326,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.41375524431208105,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.656,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.6699514086298767,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.8479,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.45454789301126036,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.7507,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.4518759794712517,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.6467,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.6425706473983626,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.9246,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.5434987449911853,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.8056,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.4492697269052785,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.771,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4606910163821094,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.8024,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.5384198728605855,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.763,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.6794293506762635,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.8719,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.6088738162385567,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.8365,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.5026043729795472,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.7689,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.4493042833001174,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.6673,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4475146656097217,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6928,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.4493959992386878,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.7361,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.5658826869183532,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.8859,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.49869889355059516,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7865,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.4694526316805162,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.7074,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.601500875267324,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.7761,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.49570321716905996,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7538,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.5488827437321244,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.7841,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.4978453428689801,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.7965,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.5456067995327268,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.649,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.5890815714771167,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.7557,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.4134396118443379,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.6983,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4386094969310609,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7434,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.797563002235324,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.9317,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.4435294090073385,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.7477,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4141554415288854,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.579,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.6895853306319035,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.8415,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.465860004697133,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.8201,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4413236626817581,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.765,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.5590586045272012,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.9927,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.5814757610274929,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.623,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.5456403867820663,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.8095,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.5698682014313946,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.8043,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.5645368222782049,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.7581,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.5377566999357087,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7506,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.6185626048287002,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.7798,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.4220564255020212,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.7496,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.5271457551016745,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7927,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.5147393348972404,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.7178,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.46443478845199315,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.7054,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.7135634445888618,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.8207,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.42761424683265864,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.6581,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.42313988160867755,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.7775,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.4538095633072262,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7687,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.533422559952103,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.7408,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.45460544867317326,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.7805,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4527045555244706,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7938,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.5832336658005973,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.8422,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.48220157807827396,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.7968,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.6651001691852388,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 1.0221,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.4737128702852164,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.742,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.5965628803991697,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.7871,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4601936919160773,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7287,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.5502984364373679,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.7854,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.6058604776648268,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.7834,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.6349157785706616,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.9179,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.5103853059024941,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.8123,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.5012969596052058,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.8938,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.49085332449675545,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7317,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.46192859143681336,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.7479,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.7572527685234165,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.799,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.43048805052859346,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6879,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.6290092415581088,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.8611,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.6896349768871635,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.8524,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.45147054762589583,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.744,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.5046745539072711,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.8061,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.4450247446607071,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.729,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.4991655316273437,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7505,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.6021083483612211,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.9119,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.4714342234434051,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.7631,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5178330816276597,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7748,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.4132243325540413,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.613,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.44929399257605435,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6669,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.5344871820450173,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.806,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.5158510559476444,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.704,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.5881917075890982,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.8393,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4448194722905415,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7278,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.4415817363764343,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.759,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.43254297501106265,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.7259,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.4955340970115768,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7804,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.4127057054580811,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.5882,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.4916154955577541,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.7531,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.6032246923125663,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.9182,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.5482636793168754,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.7806,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.5902884522150984,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.8417,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.6028871218261679,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.9686,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.5386609305695303,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.7806,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.5367112850841399,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.832,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.47813459747579856,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7458,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.4460385373404532,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.7462,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.4456182927472116,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.6813,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.46856392814854575,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7422,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.4539049855895284,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.689,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.46288916807437674,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.7945,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.5320408164886675,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7668,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.5043155275745976,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.7325,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.46453023128837523,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.8079,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.4749269451246347,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7462,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.514586186902165,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.7458,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.6714374876984359,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.9248,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4893196173882575,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.804,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.4027566299206927,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.6574,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.46239830297712675,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.6244,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.5212034179279587,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7334,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.5563132862800897,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.795,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.485130898303703,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.7568,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.6083852959382066,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7011,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.4750973409821206,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.6985,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.5748068195170376,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.7349,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.5506747179570219,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7912,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.41354805481557877,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.7004,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.4160478551351224,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.6634,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.40059068613912047,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7775,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.46017198399877646,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.7304,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.5902819462109755,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.8914,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5406386880085818,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.8171,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.41901452797853955,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.6702,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.553698688570677,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.8016,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5513780808148544,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7856,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.46661180535210856,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.6399,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.5504463888041178,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.8997,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.5375101790560792,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.732,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.5098924546783888,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.7611,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.5384476107750127,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.852,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.5350115576094381,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7675,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.40574771665868714,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.707,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.49779397012835314,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.7301,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3731798846707171,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6111,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.4850756352758892,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.7519,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.6139684422255738,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.7989,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4434011240395176,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6933,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.49808530175436067,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.8281,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.4681737004455478,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.7115,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.6399339214620106,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7812,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.45419802550361577,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.7287,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.48160977267166555,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.694,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.47089987083089757,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7516,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.4769968094972423,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7607,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.5113445305578702,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.7669,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.5951036250978381,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.8113,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.4085341532710229,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.6495,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.45731725676548396,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.7541,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.47515240745984405,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7613,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.45536664362280865,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.6845,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.5941606028102293,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.9209,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4863295015502449,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.799,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.45670743525230284,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.6918,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.4927898296403875,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.7508,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.6233914888731245,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.8231,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.536198709515224,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.793,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.48756188330078964,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.6774,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.5712139069480937,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.8973,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.5492260719671762,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.7927,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.43602755004802324,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.7323,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.42252395128406706,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7631,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.4062666723274316,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.6829,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.4744672048807581,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.776,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.49002783081861784,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7178,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.5824978436248245,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.75,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.46847652704699216,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.7302,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.6688715442609833,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.8042,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.6125074641738367,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.7949,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.5060782840915192,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.8076,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.6730189494287294,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7791,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.645212680644595,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.7827,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.45367478391855637,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.7507,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.45461517927777706,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6742,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.5659063461270785,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.833,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.5025047825967893,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.7629,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.5556115753964004,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7558,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.49676283537854166,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.8382,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.47517253374761986,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.8045,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4261562907967871,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6711,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.5090365893278739,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.7895,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.5179103729664175,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.6724,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4885247064850998,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7453,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.40492027858978047,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.648,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.4311728644577337,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.7179,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4371474337856531,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6316,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.4676358611503632,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.777,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.6847560977935614,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.7608,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.5544948643545228,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7362,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.9224829073702737,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.9211,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.48080644250290394,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.7111,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4739551772230298,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.778,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.5312235651493865,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.7499,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.6559582873219216,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.8333,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.5586187196621849,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7403,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.4835618153284476,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6013,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.44762718065527035,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.7476,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4565399260657701,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.8032,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.5357186140583011,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.793,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.5570464751871326,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.8484,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.4569021890750186,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7876,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.4235006099986388,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.7685,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.4407742074198759,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.7324,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.5090580952169202,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.8099,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.4501523120592187,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.8108,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.4390198907367184,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.6698,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.4872262737377709,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7206,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.4078671554593861,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.6716,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.5328962011820283,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.8441,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4505548197220081,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.7188,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.5028372157738586,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.7702,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.5881035368653048,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.8102,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4644208673093524,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7863,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.9184436029738751,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.7369,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.5238806668810477,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.8379,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.573507459169721,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7635,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.43712869771798696,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.6759,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.51492141619251,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.7879,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.5589420565939197,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7274,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.6130439212218944,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.8891,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.4423169657489094,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.7818,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.44932253972913216,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7049,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.427777477401564,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.7368,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.4174624721940442,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.7708,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.5562260719661347,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7334,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.632717622904955,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.7849,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.4359625588912055,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.6654,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.41891582421355134,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7579,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.42159163914455067,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.6646,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.5811895061510105,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.7739,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.582367703020442,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.8764,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.4975177901929358,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.7879,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.46707907109971164,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.7714,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.40234271285614664,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6424,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.5354746693833636,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.7484,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.5277899940401004,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.7593,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.8114151816548788,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7835,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.5238447886409662,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.7476,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.4034433634699191,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.6916,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.49701716112196237,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7838,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.62062323297395,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.8154,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.5068903167924839,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.7607,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.460139403860215,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7481,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.46487020590647377,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.7388,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.48953201444434585,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.7033,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.5136215721502551,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7526,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.47109621415579506,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.741,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5502634222197873,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.8411,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.5134976989258431,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7553,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.4898318259638587,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.6702,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.5227245331435747,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6482,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.5128301572867259,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7159,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.576890268074048,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.7952,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.432253950885244,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.7143,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.41431838996966186,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6468,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.6219635281970626,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.7979,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.47001155233193037,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.7523,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4731698920729862,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6733,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.48348627346619377,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.6778,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.5373809731810197,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.7862,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4302728310666555,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7408,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.4502300529686932,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.674,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.4749352377536877,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.6564,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4961234868019811,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7162,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.5992313824361511,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.8486,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.5125638267756718,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.6822,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.6175810913004475,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.8007,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.4591985784213636,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.6769,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.39331101563430665,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.7082,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5101318179916204,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7578,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.7519008983138528,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.7503,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.5437703016808688,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.675,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5086423764723533,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6744,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.43966489761898947,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.6865,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.6480433410332094,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.8626,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5436825086049654,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7899,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.44657371639546795,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.7399,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.5676457737282187,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.842,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.41702442978796533,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.67,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.4708337288708888,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.7049,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.5326747190581289,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.7529,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.478308999115387,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6871,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.39650223263914064,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.6837,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.5370217535675965,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.7394,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.5541533985113412,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.8083,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.612301493210298,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.841,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.5565547179646171,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.8259,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.560703868530753,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.8143,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.5452429098540207,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.657,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.7076748620068148,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.8688,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.4723531210225006,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7277,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.38012171936952244,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.6425,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.603421292079894,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.8161,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5015500231918112,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7039,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.4976613648134939,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.6392,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.42231826227758135,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.597,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.5261772097736712,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6993,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.5720749507167654,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.7378,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.4854951914620495,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.7636,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.6185097610107433,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.8268,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.4211057093206592,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.6306,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.438023596550362,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.7062,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.5537641063198323,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7415,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.49633656204916216,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.7946,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.5343680050628211,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.774,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.6698593891009573,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7576,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.40126281760654753,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.6637,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.4262446862725171,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.72,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4350305002054411,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6839,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.4811962513455146,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.7616,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.3827499220134474,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.6725,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.49317883787553995,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7569,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.434883042566961,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.6309,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.5796238083418784,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.8339,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.45368962247559386,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6974,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.4715066554294283,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.6997,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.5473163389609469,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.7799,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.50186115330255,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7963,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.4587261665217103,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.7275,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.4527605528114259,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.7458,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.5838291596429387,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.8659,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.512717733298456,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.6596,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.5987342949739224,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.6745,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.5129825298493389,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7741,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.46113293033011576,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.7784,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.48071864996884056,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.6686,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4857320789554157,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7695,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.5172260493693579,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.7837,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.4839080726160306,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.7814,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.5369781118832275,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7994,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.40285291682654806,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.6685,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.43913336406413467,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.6789,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.36850949745938216,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.5985,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.45256461892901884,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.6901,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.502801265442768,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.7156,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.5754365140036534,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.8076,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.5397331045353989,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.7374,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.5325726924140836,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.7451,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.8225720471674686,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.9019,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.3863498805878031,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.6633,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.47018195009456326,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.7058,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4848003787682862,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.8032,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.5708191505695828,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.6495,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.4856249306646381,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.7603,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.5102703988217077,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7876,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.47570898672635514,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.717,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.43657326247985034,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.6536,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.540699085510768,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.8676,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.45587842900608117,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.6982,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.5504378504505844,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.7597,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.5536472829823509,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6834,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.5188680535148331,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.6965,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.4464326809238666,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.7012,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4429020145145274,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.715,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.49953986575210324,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.719,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.5489530610657923,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.8231,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.5161147727463107,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7433,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.5372609302833079,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.7723,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.6341247855319071,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.7647,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4109938086219669,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6744,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.505211435260148,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.6792,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.5648569797645943,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.7383,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4893528457125704,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6933,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.4045184960271441,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.6603,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.5861357493096865,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.822,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.38795550705368015,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.645,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.522397763584867,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.7986,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.45390817918484644,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.7065,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.5231333092724972,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.8067,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.43445809413391945,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.744,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.3844381213204554,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.6569,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4677279618543648,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.7083,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.4924843715111023,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.7512,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.4877946483819527,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6858,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4464224687908001,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6764,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.563939766074518,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.8946,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.4595855410253218,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.7412,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5629112802578629,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7561,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.4232588748038259,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.6763,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.5377103251563394,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.7588,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.49726972943186487,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6818,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.39578621570415834,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.6593,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.6296991797938082,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.8284,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4689559507210876,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6648,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.6139376355920769,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.6431,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.502667099627591,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.7902,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.4718534691657566,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6981,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.42320038156633955,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.7106,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.527489408575544,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.7137,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.642591509139478,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.8138,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.5022455342624368,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.7695,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.528433410560178,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.7774,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.40181853778869725,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6488,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.5703147394371794,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.8427,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.6066274810399689,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.7415,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.5318681701221755,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.7071,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.43522551039961643,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.6577,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.48382212436414673,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.7464,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.47912591586128217,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7578,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.5554221788061645,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.7379,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.46179248308158116,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.6997,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5520896260902555,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6774,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.716871579835956,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.7016,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.4688654344543536,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.6842,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.45952842117017356,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6954,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.6037020858318494,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.9216,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.5112284522906911,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.7414,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.6778229380656393,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.96,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.5318024704287667,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.8142,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.5504999061969572,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6299,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.7168605084150267,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.8434,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.5326176287349795,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.7535,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.5198798992229317,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.7693,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.5062151710695797,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.8153,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.5339691423415097,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.8945,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.7436470699997376,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.8291,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.5341457861939325,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.7916,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.49275760119798284,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.7193,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.5435704259557418,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.7764,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5818631696995898,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.8297,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.383556959342635,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.737,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.453342843949634,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.7155,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4352936908218544,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6515,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.5836509030705093,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.8344,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.49040324968759913,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.6792,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.4928716736478312,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.74,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.41012802144608346,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.578,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.5183382299695679,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.8857,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.5061749397926986,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.7264,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.5191869515210203,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.706,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.7585326022201629,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.7991,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.42278581011950195,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6932,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.4679848278643742,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.687,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.48323632128045796,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.7556,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.4557423553247412,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7141,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.5294898419459036,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.8618,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.5274095672286221,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.7719,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.502821859324942,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.7703,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.5882461866882848,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.7777,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.5100547868843539,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.7625,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.6289157409884821,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.8166,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.5246731496364857,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.7315,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.5517644110475223,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.7715,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.5750468069036211,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7753,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.571760864960472,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.7922,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.3844181462637176,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.7023,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.43877818166129945,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6706,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.3868466487357469,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6642,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.492322352337929,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.6917,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.5446397964981483,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7429,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.5543790229843352,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.7267,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.5247544238266016,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.7613,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5039706234528507,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7619,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.5713153815073273,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.7059,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.44923373773430025,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.7205,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.5263321786867668,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7533,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.4240566695575976,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.7257,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.4737882576220856,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.7747,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.4316557468939239,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6621,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.5249481145085421,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.7565,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.42462653959991414,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.72,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4443353759436208,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.7001,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.573134473668386,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.6942,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.4585178063664748,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.7028,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4948736403806755,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6499,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.49334388896702935,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.7077,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.6176119339584434,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.7847,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.43537482237087494,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6734,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.4313016572978186,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.7047,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.440581929957751,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.7409,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.47207732516459877,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7346,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.44963953263791284,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6737,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.5650758578094732,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.7682,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.50788328603849,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.7383,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.5799854089956136,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.8206,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.5175275043881233,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.6883,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.6954639855386038,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.802,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.5270117727580154,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.7876,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.44282138308302416,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.6937,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.44350813061965494,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7533,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.4290269336099863,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6326,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.47516800700789324,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.6657,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.44226535940405953,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6882,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.6018934743437193,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.8168,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.49401317098463216,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.7495,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.46866523180432645,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6789,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.5246029946716454,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.6717,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.5835799171355602,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.7297,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.5048177896739099,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6995,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.4361443087723303,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.6656,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.6058728372984273,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.7824,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.6374772059823162,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.8744,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.4282587409275975,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.6471,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.429099311758806,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.6038,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.46101280173127135,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6908,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.4437725012180013,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.6629,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.7348344852509376,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.9117,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5289879732485965,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.7368,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.4774681909052075,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.7869,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.4391310441341285,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.6039,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.6015495262618114,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.8394,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.5681477525141515,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.7568,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.5289834794894012,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.7474,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4754287143901146,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7412,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.49668724912942624,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.7087,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.47088805597233857,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.6956,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.5627325429138672,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7543,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.4611693226301201,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.6894,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.45336038424072905,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.7619,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.48155579631399054,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.799,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.5930487351489279,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.75,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.45997113728666444,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.7387,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4772986674601558,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7531,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.342855727151531,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.5577,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.5207852106715284,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.7428,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4480357947246842,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6931,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.5086770454566198,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.7593,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.4999233008690756,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.7642,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.43508693288352246,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.7282,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.5929791884787343,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.8132,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.4793522536878096,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.6612,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4082634546055162,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.7086,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.5522124987414765,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.7897,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.49094203430688615,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.7265,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.5446964417982316,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.7531,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.47350783687561393,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.7659,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.47482720663461636,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.7414,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4908705406803905,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7528,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.4250681206948574,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.6601,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.5343277827022553,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.7508,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.5335001480302303,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6447,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.5026620625081853,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6882,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.5794331730821927,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.8791,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.44335294230259914,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.646,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.5489816260124318,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.8286,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.5339060440220494,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.776,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.3975566328563563,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7191,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.7170117974700864,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.6505,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.5443132706187244,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.7023,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.6813717234168647,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.8853,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.42647268339192634,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.6592,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.5076398265848955,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.6896,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5395449562699478,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.7446,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.4397096635647227,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.7337,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.4661381882198113,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.7243,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.5304568695304118,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.686,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.5139764135159359,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.7702,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.4789455046031435,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.6182,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.6320559508800366,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.7692,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.4280059561481728,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.6595,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.5164363899718144,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.7251,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5049914807371246,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.684,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.5739698211966615,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.7271,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.5201445604164033,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.7586,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.5842073507004636,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7876,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.5006665125106674,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.75,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.44481322280762703,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.6589,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4780565108382196,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.7133,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.6376601193425918,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.7181,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.4881973656946014,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.6112,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.4847015050679535,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7145,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.42890771448161547,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.6892,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.609872097613963,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.802,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.7133164250244552,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.8849,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.6242493206708636,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.6744,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.5168845067044241,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.666,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.5621679580990633,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.7005,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.6475097082467258,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.699,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.4777742852510045,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.7145,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.5394317307286133,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7658,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.507781052328838,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.6443,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.4927595899122509,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.744,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5229718780304551,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.7315,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.44984755894237033,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.6701,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.5030223372803909,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.754,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.6429144293745496,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.7469,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.5339353774693559,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.7461,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.5074697215392797,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.7243,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.5787063907047957,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.8122,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.46474831994188626,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.6343,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.4647701639052436,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.5977,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.45392179343711336,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6932,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.541610814787558,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.6834,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.46960670020822476,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.6951,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.5288161163972965,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.734,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.44749151366752427,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.7222,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.44108969518091085,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.6188,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4740794448001084,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.5919,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.5574353313596002,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.7609,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.46755971845860866,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.6999,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.482119103447549,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7095,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.3913894476048914,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.6112,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.4993711521302121,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.7427,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.47052895381251125,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6776,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.5618426674787269,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.7462,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.5218744908763494,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.7113,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.6189167119642783,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.7796,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.5042245301115625,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.772,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.6297885943248943,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.9115,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.5568572099403468,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7492,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.6810557755957365,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.8225,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.5607435337013624,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.7527,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4515840060236342,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6808,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.6801113082103843,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.8237,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.4562347313207618,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.7752,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.6472491506694351,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7549,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.5321472350862856,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.7525,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.4816956212107591,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.6833,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.6643577154994802,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.8251,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.5978505249505609,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.8071,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.5947337349744622,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.813,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.49178024775865825,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7759,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.4975472909610797,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.7224,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.6241219647197276,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.7566,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.42948075558014465,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6733,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.3953019485702067,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.7235,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.43381927608250337,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.6811,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4883812440087532,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.7771,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.4012524999491689,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.564,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.45407652559344924,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.6878,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.4360734707364681,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7452,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.4338415317459866,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.6691,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.46791105312666936,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.7133,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.6443437997468915,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.7591,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.5840070559987589,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6346,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.5121836872551004,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.8103,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.42595043794524284,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.5986,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.6219160007133622,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.7577,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.44806459453518815,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.725,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.42680013936557165,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6499,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.4873858389555336,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.7275,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.90636018002638,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.6766,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.5276246709013431,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6333,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.4695454884594552,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.6974,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.48589883346143603,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.6775,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.5344726949607503,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.7416,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.4755854308811973,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.7098,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.561545222591497,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.7162,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.5080344993691123,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7329,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.4208799586754482,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.6437,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.4594465001381088,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.6687,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.5416270281848486,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.7968,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.5311902983698358,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.763,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.518468997652413,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.7842,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.47316950851905404,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.7328,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.4072261792424843,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.7069,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.4818977614864567,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.7396,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.43408037155400875,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.648,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.4554128660458298,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.6514,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.40047115756802987,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.7047,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.45744072060633134,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6084,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.4125268252314731,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.6247,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.3556923600367426,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.6169,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4399392218149216,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6586,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.4537009793185571,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.718,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.7025833802101566,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.7511,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4361650334499131,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.7057,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.47627661834461577,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.7142,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.39433561933398675,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.7282,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4600824531990041,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6502,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.8783694858925974,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.7062,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.6199721366104255,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.7433,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4291551148985123,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6791,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.5532761091198096,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.7314,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.4680846478382772,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.6969,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.6169871804556015,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.8083,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.4785911706625698,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.7172,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.44216287726421416,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.6858,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.621762468829309,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.8043,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.4971053975514178,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.7076,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.499530457165973,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.6667,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.5551557815670419,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6791,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.5502462406809632,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.6928,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.4185991301854179,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.681,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.6102472819045014,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.7735,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.4038728596972177,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.6997,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.6112272850635387,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.7906,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.6823050571999114,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.7393,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.48044910575690514,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.6921,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.5099273325096785,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.8236,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4322407642986071,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6794,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.44129741138352807,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.7084,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.4753167614877353,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.6671,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.5739284289831437,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6935,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.4229774415123507,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.6782,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.5193351176220204,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.6292,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.8117702174014183,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.9323,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.47062059840601245,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.7921,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.5121954255095074,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.7148,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4282829992510614,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6855,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.41795780800866367,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.6131,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.6207093528235793,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.8102,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.5090173373571176,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.7484,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.4538782522384936,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.6717,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.60000436823127,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.8122,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4250546805274872,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6265,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.5040520980980161,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.6753,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.49996306159161275,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.6779,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.4712278746479505,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.68,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.46074573044129796,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.7227,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.493900195322433,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.6689,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4344625130464455,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6555,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.5206790882786125,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.7867,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.5289091037967014,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.7056,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4329363837209095,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6419,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.7364925592792445,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.7942,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.4287919792677331,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.6089,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.5987593143476608,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.8248,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.47424660935512686,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6991,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.4663440261234151,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.6776,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.4624155823554257,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7055,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.48475807611492766,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.8187,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.4494717541871752,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.7191,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4835044715176101,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6616,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.4838683531014212,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.8107,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.4725694544089572,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.746,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.3979184087752747,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6655,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.3868242300055193,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.6417,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.46778212411889863,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.7105,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4598984573387436,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.7256,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.5866239669235088,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6933,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.4543808142362355,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.706,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.44260354086610093,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6791,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.4170564242554412,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.639,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.4597202296846262,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.8017,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4529920738735563,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6999,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.657461638690797,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.6636,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.41654680911541275,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.7,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.45624516917161206,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6794,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.5615950013332655,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.742,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.575824823799937,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.8335,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5046522891338086,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7059,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.6991670890182904,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.907,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.43206580877235,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.6466,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.44256457653611103,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6343,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.5287835942194266,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.7463,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.4843669519470727,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.6667,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4103340157544824,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6974,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.5184298403298317,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.7487,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.6082050912128165,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.7298,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.47946874007511087,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6172,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.6112987394155805,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.7834,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.47366441421557115,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.6871,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.5156334997582206,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6881,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.5007794686429051,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.6776,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.41915625038203275,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.665,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.6067230351998149,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.8348,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.6362295947710719,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.8087,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.47269885739172784,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.6545,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4550854258944117,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.5825,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.43939832729846506,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.6521,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.4969689643834163,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.6967,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.5319636056438496,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7467,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.3596476957935459,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.612,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.5949674960873456,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.7606,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4993480554368963,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7188,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.4356732798225739,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.6958,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.6594548762626951,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.7899,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.42673152905831363,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6451,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.4180416578744134,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.6719,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.4319759353989045,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.6994,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.511914801055679,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.7403,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.45594120324085713,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.6978,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.5587672330196816,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.8151,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.4672484923497262,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6433,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.4034606284395722,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.599,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.4841435384502967,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.7121,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4086861829838414,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6631,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.5084992483479394,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.7367,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.4870556258038371,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.7428,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.4242105121906902,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6565,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.43070180276909636,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.6833,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.4769604410420578,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.7616,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.42477402577716145,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6216,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.4605481962020174,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6322,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.4425138437152816,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.6946,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.5096055721561739,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7539,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.5304704102228746,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.6634,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.5496107480681083,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.6355,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.5425962969344147,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7227,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.5243601914848159,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.6934,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.48066922628094294,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.7211,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.5655455844295989,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.7969,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.4283099962354034,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.6584,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.4043136244440144,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6298,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.44838038555331095,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6608,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.7564798392992993,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.6701,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.40676313098529043,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.6538,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4795392201498022,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.7172,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.40742487024026375,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6306,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.45877808976714857,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.734,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.5743267675772135,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.8098,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.5722405526518496,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.8646,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.4610864305157812,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.6968,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.3874492764828429,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6075,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.622146723828427,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.7744,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.5160608743139206,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.7703,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.6041189830556892,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6936,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.5118544620614286,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.6941,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.5343848880071994,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.6392,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.49079751936485655,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.7257,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.4637486450647678,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.6565,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.35597581447012905,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.6244,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5065092596901237,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.683,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.4674308576590083,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.6693,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.5667653011508343,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.7204,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.48981590277493864,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.756,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.462165485587369,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.6905,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.4940514688545709,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.7038,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.5048306343312814,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.776,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.5637897209501818,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.8129,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.5150800518400652,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.7644,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4793702040563132,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.7541,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.4473462128597777,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.7954,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.4984617422783079,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.7314,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.5275269532360491,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7556,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.4168873057000073,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.6481,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.5696623926762724,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.6942,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4851847663409578,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6983,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.47119487192422155,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.6302,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.5790827839290962,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.7274,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.398730184091536,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.5725,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.47337038159620887,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.6636,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.49541788249273355,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.7282,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.46621975936895355,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6791,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.4817558813921772,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.7099,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.40107644088369515,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.6479,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4679791613158336,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.7141,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.5619103417673081,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.7529,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.6447959335340104,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.8111,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.43316735390269656,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6778,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.44721925797070633,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.7709,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.44745049139874443,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.7194,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5097152855164827,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6915,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.48789010654704096,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.6994,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.5431939629329696,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.6987,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4845379187086868,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6198,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.4397358122971218,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.6953,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.5341035258238761,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.7675,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.5041287666272612,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6628,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.5034487183315479,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.7357,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.41166319860577366,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.6495,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.482859318866857,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.7002,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.511971705284019,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.7694,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.4246436530163462,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.6638,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.39750627270940414,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6434,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.4285443608648601,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.7199,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.47386557717487543,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.7379,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.4751932346525158,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6993,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.5829683665406566,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.7968,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.488540059171816,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.7717,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4039688287508916,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6184,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.4376999076705189,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.7082,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.4754260162700392,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.6569,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.49060827160061726,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6498,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.5345195921909358,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.7227,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.5193383074244806,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.7667,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.565638644612035,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.7899,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.5015026943257141,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.7851,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.6270339345436331,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.8221,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4788575345151283,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7483,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.4989941715945976,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.7683,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.5400799400718792,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.8127,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.47439414857581497,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.7164,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.6841191889191034,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.8663,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.6359287469277909,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.94,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4492621197075347,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6509,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.5456428181444235,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.7309,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.4118915710082582,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.7186,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.46232257898409884,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6662,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.44837771621131517,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.5715,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.5570450286226029,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.8057,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3937926368941584,
+      "learning_rate": 0.0,
+      "loss": 0.6504,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1530216586477568.0,
+      "train_loss": 0.7911407696406046,
+      "train_runtime": 27947.7114,
+      "train_samples_per_second": 1.073,
+      "train_steps_per_second": 0.067
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1530216586477568.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..849701cf3500261616829a852c4694138814644e
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..53a241e7eed6d1d4e53657fea6fce2bac122187a
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20f5fbfaf78f2660b5a7e389bd0cabaed01836ceef82cec08ccf83f292c1b145
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ca4541f44ba0763c9545308c17ed43f708e082c4
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d27eec83422eb89843f47c1c7465c54c9b0a49881f7bde3aa8eedd2417713e2
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..82e389d2975fa50bfab5506c7cc45bf0f9e3d15b
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9338962732953701,
+      "learning_rate": 2e-05,
+      "loss": 1.3626,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7826216851674797,
+      "learning_rate": 4e-05,
+      "loss": 1.1873,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8026087339200398,
+      "learning_rate": 6e-05,
+      "loss": 1.2351,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.8953557913806978,
+      "learning_rate": 8e-05,
+      "loss": 1.4091,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.6344660427218007,
+      "learning_rate": 0.0001,
+      "loss": 1.0512,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.7817986163856506,
+      "learning_rate": 0.00012,
+      "loss": 1.0443,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.9563762281220346,
+      "learning_rate": 0.00014,
+      "loss": 1.1946,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7861431922120908,
+      "learning_rate": 0.00016,
+      "loss": 1.1313,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.8557088838865845,
+      "learning_rate": 0.00018,
+      "loss": 1.0663,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.554668877411433,
+      "learning_rate": 0.0002,
+      "loss": 0.9227,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5924202455623732,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.8726,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5998948332967988,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9337,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.6523668340417098,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9656,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5911926471630946,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.894,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6985684411115631,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 1.0268,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5709438499133971,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9594,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.7521237360681552,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.9985,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5597775297927315,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8846,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.6123226618928314,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.9999,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.6191572504342515,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9126,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.8294983017105749,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.9755,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6354126575114571,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 1.0064,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.5424883492269076,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.9076,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5081500779540952,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8983,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.692718915554635,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.9351,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.52678134497159,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.9046,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.547165568628004,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.795,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5576260347576817,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.9572,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.6056494284984847,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 1.0077,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5656232329769246,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9427,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5604826934003616,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.9405,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5051155342791347,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.911,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5019309977427193,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.9022,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5184966962250892,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.9031,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4651416151244891,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.8528,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5841766928098853,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9857,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.6030595623246762,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.9255,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5559191895895258,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8589,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4967165228070396,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.8613,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5653765926413602,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8859,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5422013600282081,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.8828,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5826459751875703,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.9142,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5535975218457786,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.7845,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5295297855307182,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8461,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5026378000178681,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.8962,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5685172681516242,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.9051,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5175059491207624,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.9218,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.599725245514314,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 1.0399,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.6362525861936286,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8076,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.9030266676654153,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.9246,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5056758352509887,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8316,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4744199957100715,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8464,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.5331700094171545,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.9717,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5265592779579443,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.9252,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.47205781532010516,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.873,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5407857333191397,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.9118,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5818653548795125,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.9831,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5015172076464853,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8565,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5694650083854068,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.8903,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5972673674616251,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8937,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.47173773575170286,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.8544,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.577412887943692,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.966,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.6044123923844257,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8515,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5600973637310778,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8436,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4361774278276434,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.7677,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.6968930247271232,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.9854,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.6660101095375524,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 1.0091,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.557034072428792,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.929,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.5203200972002412,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8478,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5104131949732492,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.865,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.5166062785055189,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.909,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.49552636377658477,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.7994,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.539523454982455,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.9213,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5627036368958536,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.9762,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.426658380905887,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.778,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5399147516953222,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.9,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5937823156558912,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.9843,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5798363712102641,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.9274,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.6073609011014686,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.9492,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5511641207054866,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.8819,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.48773260003605995,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.8691,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5859319345899571,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.9468,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5041094523292482,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.9376,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.44240497780643767,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8236,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5075138902278976,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8903,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4319744942169732,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7573,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.47439141142404107,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.8451,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5636368435856511,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.9061,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.507050223866283,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.8482,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.466920542665784,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8553,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4871840557622525,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7571,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4864553660708385,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7767,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.5351561565292604,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.9148,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.5530476011968923,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.9433,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5867933810167691,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.9014,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.43778812317460986,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7746,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5528980307406878,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7819,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.5440716658453119,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8859,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5995595635001515,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.9349,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4999901123514004,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8627,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5823620592442721,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.8166,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.5364205602794443,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.9191,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5082812495075835,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.9031,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.5758084411393675,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8901,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.9777702250295537,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.9929,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.6783283770702636,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 1.0695,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.5271622038577951,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.8938,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.47357167917490556,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.8447,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.5171733446789377,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8556,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5193440460225393,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.9351,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.49741905461575303,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.8259,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4607711393942658,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.8085,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.5051630516846702,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.9029,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.47151404685858905,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8433,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4444990889893979,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8156,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4812296562563869,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.8422,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.6292710246476524,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 1.0499,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5310747533883055,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8492,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.6780462054568485,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 1.0863,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4102062485457246,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7337,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5575101890539665,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.9218,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.5170278022858484,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.8053,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4748794796067836,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7251,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.6963277572293579,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.9772,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.538275454501302,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.921,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.6309655795674255,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.9811,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.5345156432452945,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8148,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.5686942164334279,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.9561,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.578774730515947,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.9446,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.519069595704834,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.9614,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.5799529222159874,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.9756,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.5654645995189487,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.8901,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.44235999598674136,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.6974,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.5103462719597298,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.8175,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.44160518330831844,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7435,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.5304405430077167,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.8126,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.5066854976154387,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.878,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.5771635476196476,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.8844,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4815512394978152,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.8733,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4746860027433025,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8286,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4757866033805745,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.8025,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.5967798401225021,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.9161,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4709504374032868,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7988,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4952465381629514,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8296,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.48452681690806687,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.8093,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.40245520027582504,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.747,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.5140200022581295,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.9039,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4499258564105942,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8334,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.511561509908632,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.9037,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.485439072348685,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7892,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4506842974593977,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.8416,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4448082741644203,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7594,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.5620993724610065,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.8232,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4547643790391853,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.8064,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4272377788377005,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7958,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.39687457921679736,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7649,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.572994650249389,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.9101,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4377518611761136,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7508,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.556986243572138,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.8583,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5101680561816191,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.8799,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.48972383440758366,
+      "learning_rate": 0.0001,
+      "loss": 0.7847,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.444655021704483,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.8255,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.5474795246146188,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7948,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4943924330481842,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7877,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.46711979787555513,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.8394,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5170746572254186,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8582,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.5029829766987036,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.823,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.5138577160309612,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.8544,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5702185113451829,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.8641,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5424895654513993,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.8742,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5012647906278516,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.8502,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.5609989577409452,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.8781,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.5198376762185971,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.9241,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4794985714870261,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.8599,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5814781940803141,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.9525,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4775512630920124,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.8309,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.5308946896737243,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.8173,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.6156962692255327,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.9557,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4608465291703767,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7859,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.46533608765721235,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8186,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.41599063489185295,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.7726,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.503097645984635,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8818,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.44746089254838045,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.8175,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4726203165481472,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.8099,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.6634031625087123,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.9609,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4572154396747571,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8348,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3567945592222823,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.6537,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4585283897877737,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.8717,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.6248603786599073,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.9682,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.7495566278542247,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.8608,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4420718156712989,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.787,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.5111917235368519,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.9175,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.672438508212136,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 1.0719,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.5116568664709935,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.8358,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.5203643567272366,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.9163,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.38870695570150277,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7233,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.48407545472042224,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7864,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4947500456056011,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7789,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.45109916588292803,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7759,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.40198365763235006,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7593,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.6390697363623546,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.9247,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.41646129282270006,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.765,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.5981197840829817,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.9303,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.48170462062771713,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.9285,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.600508673442443,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.9781,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.716824076932732,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.8484,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4414738931347693,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.8276,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.48978516953737894,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.8365,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.471248437485331,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7754,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.47196979483999146,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7898,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.5777611166728946,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.8558,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5703589955979588,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.9313,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.47223463059194537,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.8815,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5333634484913804,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8196,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4788802228974212,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7387,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.43697629339971344,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.8289,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.4259464304601749,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7418,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.7166274928047626,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7819,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.458133618567669,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.8053,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.46702827888913045,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.8292,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.48946019781873046,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.864,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.42289219273319145,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7392,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.5862973098247786,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.8668,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4081715394623093,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7641,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5725023324849362,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.9315,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4608176923491287,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7366,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.8938209745291172,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7719,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4999641933480804,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7636,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4888845540498063,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.7827,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.6261751303771892,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.8178,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.534356559184315,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.9203,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.48992129201536144,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7421,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 1.9767206209015142,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.8938,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5072459343961812,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.8005,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5304781205118109,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7819,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4715227659427893,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7934,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5006365528434477,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.8368,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.4752713961405422,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7956,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4642589867124423,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.7403,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4992929327151965,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.8374,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.5094978394096521,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7637,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4381104288206429,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7848,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.5460531829956119,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.9106,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.49516879173004186,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8651,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5256022998581887,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.8802,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.5380446125266448,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.8876,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.5540982076688425,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.8525,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.45215703073694374,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.8408,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5018383080597864,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.8539,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.38150102790816764,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.6628,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5080972505410488,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.8006,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.5340085192497882,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.8429,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.49340574996400083,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.8355,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.45743507211166096,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.7672,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4443719898917557,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.801,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.48772267575996897,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.841,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4701443478298068,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.8341,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.49557956937738107,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.798,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4841522849880639,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7349,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4338429266316266,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7883,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5805364761275487,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7449,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.42227066125837287,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7576,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3803044681673021,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7592,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4688417390454173,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7686,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5236357934238897,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.8839,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4956689763739436,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7857,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.4804928599152176,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.8408,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.4728394490027128,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.8468,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.5210750785762072,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7312,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5491370018515945,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.8933,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.5730445786288376,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.9124,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.3985859115466495,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.5921,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4964854620626818,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.8046,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4720944108820016,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7467,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4789031946168028,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.8,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.5470683104324127,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7722,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.5130628373845342,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.7779,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.5066102428875189,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.8747,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.5507520276541336,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.8001,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4833001988157889,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.8805,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.5068186552958318,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7485,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.36968280601492154,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7415,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.37324077086556595,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.6866,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.5088856898732558,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.8358,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.47327719164581333,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.8017,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.48267264651946334,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7555,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.48571022308371015,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7771,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4463965041878119,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.8369,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5569870129168122,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.8017,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.44467112798772657,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.8143,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.5085062546038296,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7453,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3821873068095259,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7297,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.48984798074690705,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.7694,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.6502396395599714,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.9617,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4375290928151512,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7478,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.5455126380715353,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7853,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.45340283471262555,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.7779,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.5245684522683273,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7451,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.45518937318494923,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7914,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5603573831198544,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.9203,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.6514500429942826,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.8584,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4977090458616739,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.8083,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.5298204260092336,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.8459,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.46824487234539475,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7343,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4361963857398833,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7546,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5574599716334647,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.8743,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.53845101996606,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.9178,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.4454088008265816,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7039,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.467478344259279,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.6734,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4817562231095009,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.778,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.5080354741119012,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.8394,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.49317028344655706,
+      "learning_rate": 0.0,
+      "loss": 0.8243,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 255672274780160.0,
+      "train_loss": 0.8613840270882998,
+      "train_runtime": 4649.5942,
+      "train_samples_per_second": 1.075,
+      "train_steps_per_second": 0.067
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 255672274780160.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dbf8c3c3e43325d0a92ae30695d75e92666a357a
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "o_proj",
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "down_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ec3affd8fee0cf8124510977cd932079f312c022
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f33d930eef25650a7f2de7b5a5ed4a089a1ef9ea2546996393d02cede2fe9434
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..be5855c6294be41aee73f584693169fac188ed25
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a7e9bfa0bcd8b06c4047a796844f01f5e1fb8e3bb3b0e071a6e0016f2957def
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..080e42279da6136bad733ba671a2b6e5690ab3bd
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,1134 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7189928192176996,
+      "learning_rate": 4e-05,
+      "loss": 1.2749,
+      "step": 1
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.8319955833461842,
+      "learning_rate": 8e-05,
+      "loss": 1.3789,
+      "step": 2
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5809754662869122,
+      "learning_rate": 0.00012,
+      "loss": 1.1343,
+      "step": 3
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6245866824469333,
+      "learning_rate": 0.00016,
+      "loss": 1.2149,
+      "step": 4
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.9578086622724321,
+      "learning_rate": 0.0002,
+      "loss": 1.1344,
+      "step": 5
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.7566112530213828,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9641,
+      "step": 6
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.7522029468406204,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.957,
+      "step": 7
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4688807143738283,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 1.0045,
+      "step": 8
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4630914459649483,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9609,
+      "step": 9
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4803070483118325,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9719,
+      "step": 10
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 1.5275185100654902,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 1.0156,
+      "step": 11
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5104871457960992,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9208,
+      "step": 12
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5223297671798076,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.939,
+      "step": 13
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.410020402911693,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.881,
+      "step": 14
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.46121606847779834,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9898,
+      "step": 15
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.43133386304573057,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9396,
+      "step": 16
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4242170622362895,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.9156,
+      "step": 17
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.39417215473442285,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9156,
+      "step": 18
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.39962940402425023,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8878,
+      "step": 19
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.39626912636543593,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8733,
+      "step": 20
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.39424977571732966,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.9044,
+      "step": 21
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4121165256946469,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.818,
+      "step": 22
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3964086386880975,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.9001,
+      "step": 23
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.47469443270579836,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.9823,
+      "step": 24
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5560818463742494,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8658,
+      "step": 25
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.39498178221575936,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8386,
+      "step": 26
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3986620736028704,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.9344,
+      "step": 27
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.37157498518297744,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8847,
+      "step": 28
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.42098958243287943,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.9231,
+      "step": 29
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3831312726488654,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8833,
+      "step": 30
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.5491328097502954,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.905,
+      "step": 31
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4532999921906355,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8356,
+      "step": 32
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4442471402562814,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8659,
+      "step": 33
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4626002552002207,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.9668,
+      "step": 34
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3658469189500881,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.8536,
+      "step": 35
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3770688330178575,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8451,
+      "step": 36
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4180091117995455,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.9449,
+      "step": 37
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.35845299468910635,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8327,
+      "step": 38
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.44128783289492535,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.9457,
+      "step": 39
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4449729478342089,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.9068,
+      "step": 40
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.36216797916888466,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8932,
+      "step": 41
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.34442983699908675,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8737,
+      "step": 42
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.33788923995084713,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8122,
+      "step": 43
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.36266886627144307,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8654,
+      "step": 44
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.35251625422571603,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8452,
+      "step": 45
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.35230681149405163,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7604,
+      "step": 46
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3975760127284591,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.9214,
+      "step": 47
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3925040831760101,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.823,
+      "step": 48
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.38400758684073305,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8198,
+      "step": 49
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.38939087705052644,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8895,
+      "step": 50
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.39585944742768486,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8584,
+      "step": 51
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3987988094773504,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8851,
+      "step": 52
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4572091526450788,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 1.0262,
+      "step": 53
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.44204532283729686,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.858,
+      "step": 54
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3763897025577905,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8853,
+      "step": 55
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3336798916730397,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.8065,
+      "step": 56
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3457997656711169,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8587,
+      "step": 57
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3428806539910002,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.8251,
+      "step": 58
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4194432756457435,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.942,
+      "step": 59
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3966009547130233,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.9017,
+      "step": 60
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.38883996037217816,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.8532,
+      "step": 61
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.38206215602078847,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.8401,
+      "step": 62
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3991871520979649,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.9448,
+      "step": 63
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.37035665888756364,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.8757,
+      "step": 64
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3758459959811213,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.9447,
+      "step": 65
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4454172241386201,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.9232,
+      "step": 66
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3888728546220646,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7479,
+      "step": 67
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.36157253364464487,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7748,
+      "step": 68
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.36980532953207046,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.8703,
+      "step": 69
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3626868146678862,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8467,
+      "step": 70
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.38649257959123245,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.8564,
+      "step": 71
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.35396765028989813,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8104,
+      "step": 72
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3286780911483914,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7684,
+      "step": 73
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.35107142828820026,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8605,
+      "step": 74
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.7889957756330359,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.841,
+      "step": 75
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3187277828580531,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7969,
+      "step": 76
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.38348046342297426,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.813,
+      "step": 77
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.32316080082896587,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7801,
+      "step": 78
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3913372773736484,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.8262,
+      "step": 79
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3935015318770364,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.8697,
+      "step": 80
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3614006773996832,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.8044,
+      "step": 81
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.41466269228270225,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7944,
+      "step": 82
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.35402314029575577,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8434,
+      "step": 83
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3643060985822875,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.8359,
+      "step": 84
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4063956161169547,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.8702,
+      "step": 85
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.38678963629589486,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.8657,
+      "step": 86
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.39619346949074186,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.8951,
+      "step": 87
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.39717583396608747,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.8918,
+      "step": 88
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4461455486472923,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.8901,
+      "step": 89
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.34602688483693744,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8002,
+      "step": 90
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.34393240692217825,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8235,
+      "step": 91
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3359041764783406,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.8082,
+      "step": 92
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4279331859091739,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8953,
+      "step": 93
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.30882762174007244,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7623,
+      "step": 94
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4584337001666346,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.9175,
+      "step": 95
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3639832701678242,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.8502,
+      "step": 96
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.5282506037927764,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.9513,
+      "step": 97
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3383739993204207,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.8183,
+      "step": 98
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.36540908539257766,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7855,
+      "step": 99
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.31612659924309955,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.768,
+      "step": 100
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4469664272821099,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.8475,
+      "step": 101
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3945021235718483,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.9279,
+      "step": 102
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.44505629821008874,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.9159,
+      "step": 103
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.34264801888493884,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.8375,
+      "step": 104
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3475400842702629,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7859,
+      "step": 105
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.42311657763812843,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.8983,
+      "step": 106
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.39636955055988876,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8531,
+      "step": 107
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3448189962427086,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7882,
+      "step": 108
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.32058411092475675,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7672,
+      "step": 109
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3395674575435856,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.8186,
+      "step": 110
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3891736218202886,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.8075,
+      "step": 111
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3542055992556126,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.816,
+      "step": 112
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.38471493444035987,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.839,
+      "step": 113
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3386165881784285,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7718,
+      "step": 114
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4143057178990273,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.8071,
+      "step": 115
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.38135232392364804,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.8378,
+      "step": 116
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.40085764890401165,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.8594,
+      "step": 117
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.34240983952898807,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7909,
+      "step": 118
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.34965884048536067,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.8224,
+      "step": 119
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.34677921296903913,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7951,
+      "step": 120
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.33923871553660206,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7814,
+      "step": 121
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3638025165502641,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8955,
+      "step": 122
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3819803215786024,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.8912,
+      "step": 123
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.37494607520697026,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.8524,
+      "step": 124
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3378930714935849,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7661,
+      "step": 125
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3613187167368175,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.8269,
+      "step": 126
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3494977186160372,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.8133,
+      "step": 127
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3415589265976714,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.8294,
+      "step": 128
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.35122133620906343,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.8206,
+      "step": 129
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3304259243827347,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.766,
+      "step": 130
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.383219017788973,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7598,
+      "step": 131
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.31758087504525645,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7718,
+      "step": 132
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3662500111889786,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.8434,
+      "step": 133
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3466527871535633,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.8492,
+      "step": 134
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.39455671394350805,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.8239,
+      "step": 135
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.360736850626209,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7559,
+      "step": 136
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4631240715144929,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7805,
+      "step": 137
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3980967242407756,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7975,
+      "step": 138
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.36993486192618463,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.8351,
+      "step": 139
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3912263833359602,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.8489,
+      "step": 140
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3150082668830467,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7527,
+      "step": 141
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.32796401994936486,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7648,
+      "step": 142
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.343024029147899,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7889,
+      "step": 143
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3355843506617599,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.8135,
+      "step": 144
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.36927653189183046,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.8176,
+      "step": 145
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3325175287450438,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.745,
+      "step": 146
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.43019678727654465,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.872,
+      "step": 147
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3576768522844223,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7748,
+      "step": 148
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3463866549168474,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.767,
+      "step": 149
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3724385710741351,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.8617,
+      "step": 150
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4027111260210624,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.8413,
+      "step": 151
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.34338867544992413,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7988,
+      "step": 152
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.35661896137582483,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.8237,
+      "step": 153
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3572073678993763,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.8177,
+      "step": 154
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.35003409191038015,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7333,
+      "step": 155
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3632903263929621,
+      "learning_rate": 0.0,
+      "loss": 0.8419,
+      "step": 156
+    },
+    {
+      "epoch": 0.9984,
+      "step": 156,
+      "total_flos": 369874054676480.0,
+      "train_loss": 0.8646604896355898,
+      "train_runtime": 4609.8811,
+      "train_samples_per_second": 1.085,
+      "train_steps_per_second": 0.034
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 369874054676480.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a3249eb3c9cfb22b7b20ea510c3eef425094ff9
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj",
+    "gate_proj",
+    "k_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..529a874bcaea7ebc9ccf5de40bd31cbc029a6c9c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:303c2be5e4b7c4c2740bc8fe21a16489e7dc9ad5fd21459fc665aff86b5faf5d
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8013d9d088f7945a8538d7a44bfa35198708b62c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5389b9cc76ead40e1b7bc8b626a9b166a9dbb13ed161fd6c34d3e8d88509c73
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7418f65869cf29f8d6504adbb47bcf1aa5fabae8
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.856571687971295,
+      "learning_rate": 2e-05,
+      "loss": 1.2847,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7055923454015887,
+      "learning_rate": 4e-05,
+      "loss": 1.112,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7337135243739601,
+      "learning_rate": 6e-05,
+      "loss": 1.2254,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.837585495614336,
+      "learning_rate": 8e-05,
+      "loss": 1.3822,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.6594271990911063,
+      "learning_rate": 0.0001,
+      "loss": 1.1207,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.1223985314428306,
+      "learning_rate": 0.00012,
+      "loss": 0.9881,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.9227094007590665,
+      "learning_rate": 0.00014,
+      "loss": 1.1223,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.8477658360206024,
+      "learning_rate": 0.00016,
+      "loss": 1.1024,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 1.0168041710695876,
+      "learning_rate": 0.00018,
+      "loss": 0.9368,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7268498717360209,
+      "learning_rate": 0.0002,
+      "loss": 1.1298,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5215962705907371,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.9259,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.7553183446752619,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 1.0669,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5641728211000794,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9391,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.8498327143480147,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 1.0197,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7717724273877493,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9622,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5153040789801788,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.8716,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.6215554177155925,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.9396,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.6368311748731851,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9085,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5991974216920687,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.9664,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5880944284346221,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8624,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.7255831004379246,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.9939,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6374991872522399,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 1.0072,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.5546079581109932,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.9612,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5916519032202122,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9761,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6356011070115721,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.9587,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.6024071271891809,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8811,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.49232008604791955,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8569,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.6053773289823773,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.997,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.6642190058400091,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.964,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4782405704173689,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8375,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.6334905713423813,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.9273,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3998669898594021,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8155,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5351402876869878,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.845,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5473592406773546,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.876,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4714152405970396,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.8893,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.6233030845726812,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9926,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5851125542623964,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.9213,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.48818739539340444,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8393,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5942600495624568,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.9138,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6155460135079603,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 1.0341,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5354548010801756,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.851,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5029186103592496,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.9062,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5856045747194191,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.8925,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5165745934706759,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.9191,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5104093507497887,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.9431,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5324667462814716,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8927,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5980004948399099,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.9729,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5812454848443018,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.9417,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.6434345018371275,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.9758,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.8670925963606051,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.9639,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5142581304786665,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8442,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4784472560862407,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.817,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.5192157296603549,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.9377,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5345814539065935,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.9973,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5212648979550981,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.944,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5847828497713395,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8486,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.6036505287588615,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.976,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5185079411903789,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8535,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5087496634629276,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.8656,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.6620924291595747,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 1.0019,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4165962007102062,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.8478,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.6000561388824549,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.9312,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5645804945586722,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8775,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5457958238077547,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.932,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4563578229166194,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.7811,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.6237927980286077,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.9482,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.6096397902515523,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.9591,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.61532565982814,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.9338,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.5366533603653786,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.9319,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5165264282327887,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.935,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.5020374316473182,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.889,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5429511056967373,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8124,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.5372888520436341,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8855,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5477919990650789,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.9003,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4919770372900607,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8582,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.49579325741634583,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8353,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5782246538890948,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.9904,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.6235756903654298,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.9499,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.5559031253986597,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.9486,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5161196463465147,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7823,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.48046584574072526,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.8563,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5644927020207363,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.939,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5262516802613253,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.9095,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5162214092865526,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.9218,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.48066263602690296,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8351,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.46511514585809155,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8347,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.47764732381060815,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.8668,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5391492918269589,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.9166,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5452368893224666,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.9434,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5068755875108313,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.9381,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4561249918064821,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7897,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.47303478518505127,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7576,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.5364559677774835,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.9363,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.552194133620672,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8602,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.6092151390114966,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.9392,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4469527891085572,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8284,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5466970963402339,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.8422,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4793293027604878,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8016,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5574466292258964,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.8667,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4607255432613611,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8439,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5861238704023553,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.8349,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.5067045689470819,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8722,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5745265335709175,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.9768,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.6483302411754751,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8501,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5692191325540262,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.9932,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.7082524782346505,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.9979,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4700388644962814,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.8249,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4565116683642803,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7952,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.5252540630460191,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8311,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5200739744170301,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8811,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.44622703844460526,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.8323,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.500576250224737,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.883,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.46685072068441685,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.8068,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.49083712897749426,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.9192,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.41453739036231607,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.7524,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4723495222814731,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.8452,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.6478344448359308,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 1.0142,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5144296129212168,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8394,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.6328551875817262,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 1.0413,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.42502713429252176,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7571,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.544857543401765,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.9066,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.5182206067034031,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.8623,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.5418868636953043,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.8537,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.5521729198502818,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.944,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4584591055005934,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.8743,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5864975864579389,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.9845,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.5078042178534576,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8262,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4581746905213,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.827,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.5188671706161625,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.8164,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5207471584962147,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.9056,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.480373287574047,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.8821,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.5873723875897096,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.9182,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.46572587728199677,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7844,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.5329092227933953,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.9006,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.47219789926354094,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.8032,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.6436238273337977,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.8897,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.44969251141355987,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.8098,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.7065052879539846,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.9539,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.49430107652669947,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.8396,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.44278337903710574,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.7493,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.45657585903503006,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.8399,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.5594305788233783,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7511,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4731245915223696,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.8638,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.44866720451901915,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7519,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.5049157207422073,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.8531,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4622961459788511,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7997,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.5011867957087826,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.8336,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.44473343514751135,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8141,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.49174176435012285,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.9179,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4212233224278149,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.756,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.42131704733259884,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.774,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.43434312084011045,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7475,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.5378430206230584,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.8769,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4227908343847277,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7987,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4613617814492636,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.8833,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.40191007335151063,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7724,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5874932967179956,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.9077,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.47018012728633896,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.8459,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.5135569343532309,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.8724,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4711547922176534,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.8732,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4605391208636547,
+      "learning_rate": 0.0001,
+      "loss": 0.7958,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.5162693845942066,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.8359,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6790590303772195,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.9214,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4154627258730887,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7592,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.49204958681426475,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.8947,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.48299009755135697,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8093,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 1.3967726735458745,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7723,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.47547860623397964,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.8072,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5521765898351926,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.8858,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5403612499302933,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7971,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5678871132109066,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.8678,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.5085771385434323,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.8054,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.5265619363777059,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.9647,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4905195302788644,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.868,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5282205277029001,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.8057,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.5404677713988845,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.919,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.5081939196669241,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.8112,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.701368188891065,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.9355,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.47644014020348635,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.8545,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4479800438378306,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.803,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4568365659077264,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.721,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5628625002310027,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8664,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.39193950097624564,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7784,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.44621968664447376,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.8036,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.6552546382007541,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.8681,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4444323128940648,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8249,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4096211039326677,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.8002,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.41374321969091965,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.8164,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.6486742789837998,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 1.0459,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.5897416604730881,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.8359,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.46776013489921014,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.7546,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.5175426743395212,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.8506,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.7039149380745073,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.9374,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.45067267248124315,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.8746,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.549831797921143,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.9139,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3967114644899662,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7546,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.46586065767273077,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7676,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.5225256866262378,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7611,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.463024339644472,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7903,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.41252826026591827,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7522,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.5835285744859474,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.8858,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.449279529912864,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7245,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.6482288766459807,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.9016,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4594998006796061,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.782,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.8646261954648369,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.8832,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.6800798821111771,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.9053,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4255077691036116,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.7375,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.41644886027642136,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.8249,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.5209064217397174,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7697,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5498352812442217,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7928,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.5413265947767266,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.8645,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5288103110126229,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.8829,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5011933130582199,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.842,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.45515517378700693,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8259,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.6101065320239799,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.9956,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.46840904584275894,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.8131,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3663759497337274,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6838,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4937025705477826,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.834,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4595324612373292,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.8017,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.47738605752675983,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.8044,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.5332094621243066,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.8264,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.41762127444184166,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7573,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.5026812226060394,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.8486,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4665822142084424,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7489,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.577246877272033,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.8626,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5765141180700714,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7646,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4777580846188267,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7975,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.561248592643386,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.8662,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.5297059368674798,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.9775,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.7058474858470783,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.8818,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5339336266613487,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.8618,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.4834312926665495,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7587,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.581475851985833,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.8332,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5109554184489444,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7822,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4843338647071073,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.8022,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.44693796537617286,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.8174,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.43703907272285714,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.8016,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.49899162998005303,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7687,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.42034349795266884,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.7104,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4479587754924112,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7968,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4244693351073528,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.6858,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4589141075325882,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7318,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.47161139886206915,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.8238,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.47576380142421015,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.7973,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.493338423791926,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.9004,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.5100402517048367,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7534,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.5112008725560001,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.8408,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.41032245286755115,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7524,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.43838327663853965,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.7647,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.44398343084308467,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7664,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4533979485061938,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7551,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.40177835310822313,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7808,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.47981196343744154,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7776,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4616612061088841,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.7976,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5310570995829195,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.7568,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.4361654306179982,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.8239,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4640496078272689,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.8167,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.443990966442962,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7608,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.43147696653912865,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7399,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4187729075670258,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7695,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5066517649172814,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7925,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.40619158877482436,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7577,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.366965454218498,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7599,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.39262286158044624,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7374,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4832698354941956,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.8533,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.5287128800831273,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.8266,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.43576896063036646,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.8129,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.48231800241057715,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7813,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.597075273021654,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7735,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.6134868404814026,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.9541,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.5919719755939148,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.94,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4588676200704114,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7604,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4545424822405747,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.8459,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.5154234986338997,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7062,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.6061436738637195,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7213,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.5690397032723647,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.806,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.6929430015219535,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.9031,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4636024177548296,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7045,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.5590861235770141,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.8568,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.49918272298276084,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.8063,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.5159744366222171,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7546,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4454372767204154,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7733,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.40366650002590854,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.7058,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.5773230798700426,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.8508,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.48932838121103617,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7155,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.49731206361843744,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.8277,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.37277364446149414,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7524,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4810886401899049,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.8317,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.45269090426882497,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.8463,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4737738227998511,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7841,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.6310130183124479,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.9008,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.42158071300460925,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.6928,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.5367004719693059,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.7719,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.6060188160837613,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.9157,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4869523495961573,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.75,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.7323792302024977,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.8785,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.43686635101321636,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.8171,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.44734722608116156,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.8478,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4646935536379244,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7162,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5157929065786432,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.8251,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.7275381901396758,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.9507,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4183990280164646,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7522,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.5457365142535572,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.8629,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.49843708727416114,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7987,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4146234731795237,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.8141,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5053500587148759,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.8026,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.6588712440547313,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.8751,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.45240332151104545,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7212,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.6437603542272565,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.7909,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.5013273996649436,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.8622,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.47420264885035235,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.7697,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.564032619501475,
+      "learning_rate": 0.0,
+      "loss": 0.7968,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 262275390177280.0,
+      "train_loss": 0.859515962501367,
+      "train_runtime": 4688.9181,
+      "train_samples_per_second": 1.066,
+      "train_steps_per_second": 0.067
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 262275390177280.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..351e80d947298d787e4461efdc69c87ae5993517
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..af30dbbf5d30aef832bdcbae95f7fc31a15967f5
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:808d35cb6452a2953789a7d20aa8a1c96c5c3cedaf37f9a8e8c7ddcfdd38315a
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c5f876cec900beb9a602eca825f0509b847206dd
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c91de22c69677f2d751b762b1108d0eccbd85445f907b041626f609e8672a555
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..12eb455df334952bcb878c781a8e4de871bf4cba
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,1134 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6531013943101484,
+      "learning_rate": 4e-05,
+      "loss": 1.1983,
+      "step": 1
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7243027041411867,
+      "learning_rate": 8e-05,
+      "loss": 1.352,
+      "step": 2
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6059556630796344,
+      "learning_rate": 0.00012,
+      "loss": 1.1418,
+      "step": 3
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5897789101722263,
+      "learning_rate": 0.00016,
+      "loss": 1.1574,
+      "step": 4
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.9907059972693204,
+      "learning_rate": 0.0002,
+      "loss": 1.1703,
+      "step": 5
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.7153029742668117,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 1.054,
+      "step": 6
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5162952305618812,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 1.0066,
+      "step": 7
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4560437547903583,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.935,
+      "step": 8
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4591730374379789,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9422,
+      "step": 9
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4316152901290879,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9307,
+      "step": 10
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5647897788044418,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 1.0211,
+      "step": 11
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.48294255234229355,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9788,
+      "step": 12
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5096172315529667,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.9426,
+      "step": 13
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.44088678017689265,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.9372,
+      "step": 14
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4682252458499051,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9121,
+      "step": 15
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.39797730428554234,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8869,
+      "step": 16
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.3864393862644587,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8581,
+      "step": 17
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4188942691273693,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9515,
+      "step": 18
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.36548225855019706,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8751,
+      "step": 19
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.411547543546142,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.9719,
+      "step": 20
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.39409864447281595,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8776,
+      "step": 21
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4022259169634103,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.9015,
+      "step": 22
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3754872706030276,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.9216,
+      "step": 23
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4224595812307275,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.9599,
+      "step": 24
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5223489655078559,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.9646,
+      "step": 25
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.35308810704754584,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8193,
+      "step": 26
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.388007287057223,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.9603,
+      "step": 27
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.38353615191183243,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.885,
+      "step": 28
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3984587603283565,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.9116,
+      "step": 29
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3895086416416209,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.9265,
+      "step": 30
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.39407543468034756,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8866,
+      "step": 31
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.41353543458783343,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8962,
+      "step": 32
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3898100986337053,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8536,
+      "step": 33
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.44138844308168373,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.9389,
+      "step": 34
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.38403391055992697,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.9249,
+      "step": 35
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3554715924546394,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8369,
+      "step": 36
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.38916688084355255,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8834,
+      "step": 37
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.37673094982431743,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8339,
+      "step": 38
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4410145022484197,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.9644,
+      "step": 39
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4219801789021562,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.8563,
+      "step": 40
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3878395192466562,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8787,
+      "step": 41
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.380156955704002,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8989,
+      "step": 42
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3337757040266273,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8251,
+      "step": 43
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.373924455891821,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8769,
+      "step": 44
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.40584629185302074,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.9348,
+      "step": 45
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.35282236451512666,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7637,
+      "step": 46
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.40393619317262786,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8946,
+      "step": 47
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.40971995700029845,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8762,
+      "step": 48
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3627260591138329,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8106,
+      "step": 49
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3680533266179677,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8472,
+      "step": 50
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4015581486261684,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8441,
+      "step": 51
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.4100193735092169,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.898,
+      "step": 52
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.47180488656682523,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.9866,
+      "step": 53
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3523624395079363,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.8007,
+      "step": 54
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.35726184720092874,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8406,
+      "step": 55
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3530040185899825,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.8484,
+      "step": 56
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3528784008170458,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8558,
+      "step": 57
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.31937072880784756,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7895,
+      "step": 58
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.41956435966393973,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.9199,
+      "step": 59
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4139326323202461,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8905,
+      "step": 60
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.40453670887947585,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.8775,
+      "step": 61
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4374110458546487,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.8928,
+      "step": 62
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.39023942568288056,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.918,
+      "step": 63
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.34857819904427056,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.8166,
+      "step": 64
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3685005339493162,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.8532,
+      "step": 65
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.38789784924823806,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.8865,
+      "step": 66
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4780610692107015,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.8379,
+      "step": 67
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4235659969652539,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.8419,
+      "step": 68
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.41517088760561804,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.8682,
+      "step": 69
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.34436625989604225,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.7835,
+      "step": 70
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3500455919830399,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7909,
+      "step": 71
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.34110031360566156,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8002,
+      "step": 72
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.34510689610220674,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8199,
+      "step": 73
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.36725453965384564,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.818,
+      "step": 74
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.34943402862495193,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.8318,
+      "step": 75
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.31133790718180526,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7547,
+      "step": 76
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3658484722990489,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.8356,
+      "step": 77
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.32129076781797744,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.8278,
+      "step": 78
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.38942837430839766,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.8707,
+      "step": 79
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.35123170973120066,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.8644,
+      "step": 80
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.5354527809050915,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.8145,
+      "step": 81
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.40673598485008916,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.8408,
+      "step": 82
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3524570688187097,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8466,
+      "step": 83
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.33636392603349363,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7859,
+      "step": 84
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.39915672090481497,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.8393,
+      "step": 85
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.39886971290229895,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.8354,
+      "step": 86
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.37555348433730135,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.917,
+      "step": 87
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.37601475500174525,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.859,
+      "step": 88
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.38865844776125025,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.8678,
+      "step": 89
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3307037617178496,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8275,
+      "step": 90
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3666715522668213,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7919,
+      "step": 91
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3084693433451157,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7943,
+      "step": 92
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.37862887312694415,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8477,
+      "step": 93
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.2885506950523641,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.8062,
+      "step": 94
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4362567348068058,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.9364,
+      "step": 95
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.35793369277964976,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.8017,
+      "step": 96
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.41198650847935414,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.9099,
+      "step": 97
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.34412292057512806,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.834,
+      "step": 98
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.36291105966317105,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7662,
+      "step": 99
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3190014750409097,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7709,
+      "step": 100
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.36727250643356163,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.8057,
+      "step": 101
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4044973217971893,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.8477,
+      "step": 102
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4458033062134064,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.8934,
+      "step": 103
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3065643632920914,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7804,
+      "step": 104
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3775122540545165,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7831,
+      "step": 105
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4038170954360674,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.8703,
+      "step": 106
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.34770328740695106,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8374,
+      "step": 107
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.38937427611703035,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.9036,
+      "step": 108
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.31386321764753544,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7616,
+      "step": 109
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.34254419298536803,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.8045,
+      "step": 110
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3363058538370427,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7922,
+      "step": 111
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3389140844461325,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.8007,
+      "step": 112
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4126704473190358,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.8139,
+      "step": 113
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.5197748600439888,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.8356,
+      "step": 114
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.45172982428856284,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.9337,
+      "step": 115
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3537053348496251,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.8148,
+      "step": 116
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.39124529440222766,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.8159,
+      "step": 117
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3407576980401783,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.8123,
+      "step": 118
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.34222429427126,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7942,
+      "step": 119
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.319583365059282,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7581,
+      "step": 120
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.314408080260358,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7116,
+      "step": 121
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3224958018920519,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8122,
+      "step": 122
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.37349316101434166,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.832,
+      "step": 123
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3438868250840104,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.8047,
+      "step": 124
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3197518559056604,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7704,
+      "step": 125
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.29119469695454026,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7735,
+      "step": 126
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.349918604093867,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.792,
+      "step": 127
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.30979893873259173,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7985,
+      "step": 128
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.333052459268458,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7935,
+      "step": 129
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.29898578402757314,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7592,
+      "step": 130
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3381948350076578,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.779,
+      "step": 131
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.2753527457754542,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7558,
+      "step": 132
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.366685214156942,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.8436,
+      "step": 133
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3372299539032351,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.8017,
+      "step": 134
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4175262755242993,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.8725,
+      "step": 135
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.37915892849178806,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.8524,
+      "step": 136
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.33639199375721807,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7804,
+      "step": 137
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.36256260056872164,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7661,
+      "step": 138
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.40545845722573964,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.8072,
+      "step": 139
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.37483517096650165,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.8385,
+      "step": 140
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.34187683963694776,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7739,
+      "step": 141
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3604054800265124,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7892,
+      "step": 142
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3396109345852707,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7809,
+      "step": 143
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.30021987701543046,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7996,
+      "step": 144
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.336807864327852,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.8249,
+      "step": 145
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3840835764416319,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.8061,
+      "step": 146
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4008265556034786,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.851,
+      "step": 147
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4656710783388933,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.8236,
+      "step": 148
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.323469647972122,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.837,
+      "step": 149
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.34449303085338007,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7767,
+      "step": 150
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.6216286447446,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.8622,
+      "step": 151
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.38081216442894966,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.8385,
+      "step": 152
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3392711812270996,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.8156,
+      "step": 153
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.433515843854761,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.8067,
+      "step": 154
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3707318180470919,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.8388,
+      "step": 155
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3630076689590227,
+      "learning_rate": 0.0,
+      "loss": 0.7866,
+      "step": 156
+    },
+    {
+      "epoch": 0.9984,
+      "step": 156,
+      "total_flos": 374742464462848.0,
+      "train_loss": 0.8615743120511373,
+      "train_runtime": 4679.3893,
+      "train_samples_per_second": 1.069,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 374742464462848.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a4de3a79f7b1c90c854454a91043154efefedf88
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5806afdc2dc689aeafbfdbea7b4ab9c99efcb0d2
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b505efac2d259a2a47415dbc3f6c07333dda8a83f3d749a271eda8e07564ab4
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fcf0df93197629b22ecdd0ff452dea76300090c1
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:445dbb65de13638be7e2353171e362ec89018043af53abfb1f47ae272d3a7a39
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2bf1dd59b26fde221c3875db0bce8dfa8f9499c6
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9853113996014804,
+      "learning_rate": 2e-05,
+      "loss": 1.3897,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9506426475728363,
+      "learning_rate": 4e-05,
+      "loss": 1.2433,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7596966755410303,
+      "learning_rate": 6e-05,
+      "loss": 1.1961,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.861146899634231,
+      "learning_rate": 8e-05,
+      "loss": 1.3099,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7573496664517325,
+      "learning_rate": 0.0001,
+      "loss": 1.1346,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6407078764092344,
+      "learning_rate": 0.00012,
+      "loss": 0.9372,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.9333053900472983,
+      "learning_rate": 0.00014,
+      "loss": 1.1144,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.782816581375092,
+      "learning_rate": 0.00016,
+      "loss": 1.137,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.8777223543755927,
+      "learning_rate": 0.00018,
+      "loss": 1.0674,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.651514415869391,
+      "learning_rate": 0.0002,
+      "loss": 1.0372,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.6565878272211279,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.9806,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5886485478480157,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9688,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.6815061121637854,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9581,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.7281105653793167,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 1.036,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6914179263697914,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 1.0213,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.6208634408736601,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9661,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.6139650553863405,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.9464,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5516620513298294,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9267,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.6442118581390293,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.8928,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5685803755480052,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9578,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.7428802362494477,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.9129,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5841251723976262,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.962,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.5847566191321127,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.9113,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.602952800315588,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9482,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6574185458555385,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.9703,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.6092484268483226,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 1.0078,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5396328498627039,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8152,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5950775899897824,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 1.0315,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5400467879636469,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.9399,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5288979794543393,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8464,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5527743229596785,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.9577,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5049275012787766,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9382,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5091690367695947,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8665,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5318560761074015,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.9406,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.48159325375001594,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.9629,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5552052505514862,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.931,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.600134726583001,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.9799,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4560541668624478,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.831,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.47044662781439345,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.8897,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5810552708976553,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.9074,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5406383322389806,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.8683,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4981624487582689,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.9491,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.689665279627117,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.9629,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5207432914579827,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.9627,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5036481369902529,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.8391,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.7302880468144016,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.9593,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.592163044629266,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 1.0679,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5177981539894086,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.9884,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.6349866730973226,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.9047,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7610952787085876,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.9718,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.48386518707786086,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.7806,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5318455453071542,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8641,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.5160916396118949,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.9017,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.6283675091205313,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.9637,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5174956315565866,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.941,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5948477232434937,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.9142,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.6144641888097007,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.9617,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5411374031190403,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.9119,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.4551477650356518,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.8273,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.7539290905710584,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.9478,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4455975550076356,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.8896,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.6358718236473844,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 1.0483,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5049143290842381,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8487,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4702456070831088,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7622,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4834735307275798,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.7963,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5725717059675735,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8986,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.6255809201436507,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.9931,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.5242574559465072,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.827,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.6646028707343711,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8703,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5197197534999193,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.9064,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.5344677586822248,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.9058,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5109696739058583,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8375,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.5431977826414388,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8769,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5261171550626805,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.873,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5063472698603411,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8654,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5241634121899378,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8807,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5089293613853708,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.797,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.7184272701984563,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 1.046,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.6345040707354779,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 1.0161,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5519854710124368,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.8733,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.7052547552561359,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.8798,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.569885331187361,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.9914,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.47943110178092274,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.8458,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4777715513897952,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.9045,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4911601274902315,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.9083,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4280825855771144,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7424,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.45240547298580386,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.8539,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5183528433389661,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8627,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4560751891623164,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.8303,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4977422603317782,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8397,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.49015779978900487,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.8449,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.5016378352074524,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7354,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.563044312167859,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.862,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.5275138404498927,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8566,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.707942148471608,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 1.004,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4545736299260399,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8028,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5464208281333135,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7689,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4754134333479607,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7514,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.6104595469568889,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.9243,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5211974739858017,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.9002,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.6364402922521777,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.8828,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.48631460430578,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8656,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5447146038021392,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.9,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.6067643448503521,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8729,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5425159273352543,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.9745,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.6479741166126189,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.9822,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.5558191947311435,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7664,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4586689678505516,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.6948,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.5896943261591338,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8536,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5473809693844722,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.9329,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.506608463345342,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7825,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4707113693392528,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.8198,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.45623380739597186,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.8045,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.530383958977767,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.9472,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.5112192224272184,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8063,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5574429216569449,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.9055,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.6608037662364803,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.9565,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.539991861126441,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8963,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.7379756808759644,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 1.091,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.45346975391486566,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7861,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5412071083205857,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.9098,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.5689976450882609,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.881,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.8071897773552312,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.8521,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.6079098990009498,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.9552,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5213294498819157,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.9346,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.6003728425562892,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.8904,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.5273050145621091,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.817,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.48841372848628767,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.8343,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.6471361649530032,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.9767,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5408628642258498,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.9073,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.5114489126327357,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.8789,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.5470267907068778,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.9001,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4802931290730935,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7543,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.5839884280401538,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.9524,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4733573558278859,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.8034,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.6945277000815755,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.8441,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.47867356502676395,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.8375,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.5849597006842344,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.796,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4918697723988388,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.8589,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5010605168562106,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8102,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4891016643938588,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.8983,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.6208987333820601,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.8718,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5285129096546756,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.9588,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4776186363050293,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8365,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.49270178475287485,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.793,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4875495251241639,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.9373,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.545069187297972,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.8501,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4626572610071826,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8283,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.49533257627313665,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7813,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.510856931550238,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.8225,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4392196971414571,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.8786,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.49520723985099946,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7656,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.5306397017302849,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.8549,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.45603793666882725,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.804,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4244887206201728,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7726,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.40132261986474643,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7778,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 1.2936543266675542,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.9148,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.47889573076766523,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.8071,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.5612261063347095,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.8822,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5273885488245399,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.8306,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4717948350536635,
+      "learning_rate": 0.0001,
+      "loss": 0.69,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4840408762889548,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.754,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6221134254293921,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.879,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.45100027191191416,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.747,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.8339067998577471,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.8372,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5589104243615118,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8438,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4114548028627618,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7871,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.480650441623415,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.8283,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.6432633248760891,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.9192,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5534300778342244,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.9246,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5234553987672043,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.803,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.5076566664174726,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.831,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.5590093677334266,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.9798,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4781234381096328,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.8182,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5475350234616201,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.9335,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.5220333579306987,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.9167,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.5061996867916019,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7998,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.7067608388548705,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.9451,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4748422317049764,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.8049,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5141359500013931,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8825,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4538449077038628,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.8246,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5812532103713188,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.93,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.40602200816110934,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7816,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.44425664814082716,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.8097,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.6300211393176885,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.8242,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4531748578835238,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.7426,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.792622749855944,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.6975,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.39359733135436836,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.8199,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.6852950223585511,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 1.0539,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.5384589258458015,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.8291,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4324933046911287,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.7547,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.468709727430381,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7981,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.6803853686918507,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.9394,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.4825189840086367,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.8202,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.5144186539202125,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.7711,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.4263396250132595,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.8088,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.49941845430127096,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.8169,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.5682021495903111,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7438,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.537687101429346,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.8825,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4635947680984124,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7918,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.558291688101491,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.8062,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4657827288564771,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.878,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.6475849514054947,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.8601,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4721925507581555,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.8426,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.625925634093718,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.9321,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.7033747672815854,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.9268,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.39977159046362765,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.6701,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.46359917113099386,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.8442,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4751301690283989,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.8074,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5337082674747815,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.864,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.5196500203640496,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.8015,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5112184951236864,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.8287,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5304281425423548,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.8749,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5528623490299155,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8318,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.6111246948996016,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.9082,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4635896756486311,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.8146,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.40035242880443134,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7578,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.45841192895687083,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.8402,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4856982628923901,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.883,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.47289227158961605,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.776,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.47530983676727984,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.8598,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4884455177461908,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7445,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.44316312426098886,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.734,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5030904138998532,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.8837,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.6038132471996558,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.8938,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5260197404776972,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7718,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.34691547214333723,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.6221,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.5275459072919113,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7895,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.5570304110125733,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.8978,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.6065567996234003,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.8329,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.523924582372723,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.8316,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.5781104737849966,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.872,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.6195051512406304,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.8944,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5125990141611612,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.8307,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4176635222803518,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7378,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4996266471649693,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7762,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4586927896452429,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.8369,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.4747263380295228,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7604,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4513328285687335,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.7674,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4824499498411078,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.8047,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.44532563946804304,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7169,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4534563140397689,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7372,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.5092400552652228,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.878,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.48484441028887837,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8378,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.49988786684234965,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.8989,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.5036027600886066,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.8252,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.5663754997840815,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.813,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.48522149898487005,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7892,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.498470979551735,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.7893,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.43696460497769457,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.6798,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5250712861371837,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.8043,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.38732002126004367,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.6926,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.5840927286804334,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7869,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.41480769742238666,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.742,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.46402711009604686,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.8465,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.4769618114279666,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7962,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.49153701380015713,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.8428,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.4583848547396149,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7666,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4537547977690445,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7135,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4432071389256633,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.804,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5707491083230274,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.8564,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3892745505208706,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.6756,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.4258721911969603,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7472,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.46365446554984235,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.8042,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.456740543797104,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.8339,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.547086237192044,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.776,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.44636264565543116,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.8197,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.47417519053417406,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.8621,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.5291019588943826,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.9103,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.6565436691143302,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 1.0123,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.6482087047353086,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.9716,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.5350159462679187,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7482,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.5034986091969066,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.876,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.46017174925372106,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.8034,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5321986712064487,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.8218,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4811734868233411,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7031,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.6346894353744286,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.9176,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4838896705858339,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7323,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.5820365796986088,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.8958,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4362300861578015,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6991,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.5078369607852672,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.8024,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.40260744239183127,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7251,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.37593623442077545,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.6507,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.5456906879732161,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.8119,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.47180073161871877,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7475,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4958689938065469,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.8696,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4308524491770669,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7467,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.44350428945291354,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.766,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.47896882190111284,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.803,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.48900554232333243,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7949,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.5539089973209115,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.8044,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.41239761860155166,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7603,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4819436482045979,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.7729,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.672490555461374,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.9055,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.5497856014564447,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7662,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.7769076374803492,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.935,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.45798290239142614,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.8093,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.45082261847579425,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7599,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4849731846586711,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.8297,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5562776995082223,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.8792,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.7246955746191942,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.8973,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4508444268939336,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7073,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4665854388410085,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.824,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.4544600876679446,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7617,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.49710907207792343,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7671,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4276600021670932,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.6997,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.556101610395927,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.8193,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.429165560718107,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6873,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.579147630975659,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.8657,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4764623215495487,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.8788,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.5039653874346466,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.8246,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.471304187050522,
+      "learning_rate": 0.0,
+      "loss": 0.734,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 254243116089344.0,
+      "train_loss": 0.8617107205283947,
+      "train_runtime": 4634.4498,
+      "train_samples_per_second": 1.079,
+      "train_steps_per_second": 0.067
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 254243116089344.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1b26538b2af09e21a3409e03eb76a8752d09a93
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "down_proj",
+    "k_proj",
+    "up_proj",
+    "o_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9a54a5c280e1f81567f50eae7b177ac61279635f
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63974bbae1ea49c85621abd1c04a4c1304ee81668d211a5baa370d67b275592c
+size 671150064
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..933e8161a20b468cfc065e95580dc2c31e3aba4e
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:014a18c938c8cb914531b9540f22824a872b48e0251fcb4bb5c302cd7fa4ce5b
+size 918507402
diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..88d324766ad908a6cba631af3d3351e763ed5bee
--- /dev/null
+++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,1134 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.805546322586046,
+      "learning_rate": 4e-05,
+      "loss": 1.3165,
+      "step": 1
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7545839075789395,
+      "learning_rate": 8e-05,
+      "loss": 1.3077,
+      "step": 2
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5409861715811655,
+      "learning_rate": 0.00012,
+      "loss": 1.1229,
+      "step": 3
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6197681443183749,
+      "learning_rate": 0.00016,
+      "loss": 1.1844,
+      "step": 4
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.889292974821854,
+      "learning_rate": 0.0002,
+      "loss": 1.1847,
+      "step": 5
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.72954475633058,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 1.0351,
+      "step": 6
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.562481463872672,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 1.0284,
+      "step": 7
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4542647162185984,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 1.0089,
+      "step": 8
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.44881440081227303,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9525,
+      "step": 9
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4573316397402673,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.934,
+      "step": 10
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5731092163782907,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9563,
+      "step": 11
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5582550935793705,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9484,
+      "step": 12
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.48101884469079026,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 1.0022,
+      "step": 13
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.44489440028982064,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.9294,
+      "step": 14
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4140640546403325,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9036,
+      "step": 15
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4452589302938284,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9623,
+      "step": 16
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.39179819506926955,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.9067,
+      "step": 17
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.39366752200691585,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9528,
+      "step": 18
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.39803846092630674,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.9095,
+      "step": 19
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4125166760813841,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.9084,
+      "step": 20
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.367045512037434,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.9115,
+      "step": 21
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4204643990614447,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.9652,
+      "step": 22
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4234540671955711,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.9077,
+      "step": 23
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.41848674883765574,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 1.0333,
+      "step": 24
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5322952744746656,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.9422,
+      "step": 25
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.37120088149684727,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8186,
+      "step": 26
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.6759481035130391,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.9278,
+      "step": 27
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.38228002308368947,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.9193,
+      "step": 28
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3865683631565096,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.9207,
+      "step": 29
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3965746057935295,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8772,
+      "step": 30
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3858422141117998,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.9648,
+      "step": 31
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.35659419025313677,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8024,
+      "step": 32
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.39768086056016644,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8391,
+      "step": 33
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4335853573049478,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.9007,
+      "step": 34
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4180259491270757,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.8795,
+      "step": 35
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.36587131891456703,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8591,
+      "step": 36
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.39498933797878183,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8652,
+      "step": 37
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3948791811440726,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8694,
+      "step": 38
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4665878754601238,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.9099,
+      "step": 39
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4417260562598781,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.9309,
+      "step": 40
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.41926910617947233,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.9185,
+      "step": 41
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.33804324283430204,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8653,
+      "step": 42
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3251706874587237,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8152,
+      "step": 43
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.34873519250922863,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8466,
+      "step": 44
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3374558679907544,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8324,
+      "step": 45
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3370814914034644,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7825,
+      "step": 46
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.39413321334376433,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8575,
+      "step": 47
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.37798799795124327,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8946,
+      "step": 48
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.37049011642321633,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7573,
+      "step": 49
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.38681120648438766,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.9045,
+      "step": 50
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.40835086499060924,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.867,
+      "step": 51
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.413703584869729,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8734,
+      "step": 52
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4309517025934206,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.9699,
+      "step": 53
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.36248918663875634,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7183,
+      "step": 54
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4062156464684849,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8833,
+      "step": 55
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3370276280908736,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7916,
+      "step": 56
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.35014515823892844,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8683,
+      "step": 57
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3783891730128175,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.8471,
+      "step": 58
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4570925963839151,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.9177,
+      "step": 59
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4163069571860744,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.9249,
+      "step": 60
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.40422561275720553,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.8832,
+      "step": 61
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4052479343056615,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.8903,
+      "step": 62
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.38958133602024597,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.8973,
+      "step": 63
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.6475031242958649,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.821,
+      "step": 64
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.44626049108693816,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.9273,
+      "step": 65
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3812837364086625,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.8877,
+      "step": 66
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.36872511821962134,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.8404,
+      "step": 67
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.44849089068023396,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.8171,
+      "step": 68
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3835328292811916,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.8085,
+      "step": 69
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3511263955012711,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8205,
+      "step": 70
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4207355319366397,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.8753,
+      "step": 71
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.37549463268898003,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8874,
+      "step": 72
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.36121802031305045,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8625,
+      "step": 73
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.37456200615779306,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8337,
+      "step": 74
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3643015422631111,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7948,
+      "step": 75
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3254749605120823,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8123,
+      "step": 76
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.36585814792272303,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.8231,
+      "step": 77
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.30338389028077356,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7762,
+      "step": 78
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.541897142047526,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.8549,
+      "step": 79
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3943300521754534,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.8574,
+      "step": 80
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3297066953221598,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7177,
+      "step": 81
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3831654106693363,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.8078,
+      "step": 82
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.34403521876366816,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8362,
+      "step": 83
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3218284170360931,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.8062,
+      "step": 84
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.421195702498038,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.9156,
+      "step": 85
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.38098554789749073,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.8168,
+      "step": 86
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.40882368036803046,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.9008,
+      "step": 87
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.384069393195924,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.9222,
+      "step": 88
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4558671792764743,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.872,
+      "step": 89
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.35614041648667805,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8417,
+      "step": 90
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.40081088790439307,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8763,
+      "step": 91
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.32197931672413255,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7946,
+      "step": 92
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.39582034739433736,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.783,
+      "step": 93
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.2901849264572683,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.761,
+      "step": 94
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.45644415157439366,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.946,
+      "step": 95
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.35184756415709373,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7784,
+      "step": 96
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.41519392152425355,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.8843,
+      "step": 97
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.519841037292034,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7931,
+      "step": 98
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.38514886035821516,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7811,
+      "step": 99
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.38223327718531364,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.8398,
+      "step": 100
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3620194195609992,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.8452,
+      "step": 101
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.40812166227656616,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.8517,
+      "step": 102
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4850016643582673,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.9341,
+      "step": 103
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.31814201642044526,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7607,
+      "step": 104
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.38422423276934436,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.8354,
+      "step": 105
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3749895963805748,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.8145,
+      "step": 106
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3892782930574169,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8538,
+      "step": 107
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.40519227449724154,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.866,
+      "step": 108
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.32488927233195825,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.801,
+      "step": 109
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3446736636811925,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.8335,
+      "step": 110
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.31596911798818506,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.8055,
+      "step": 111
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3309994721343404,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.8149,
+      "step": 112
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3987584810803584,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.8379,
+      "step": 113
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3628992489997569,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7083,
+      "step": 114
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.41185878963669303,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.8743,
+      "step": 115
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3731843655247591,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.8586,
+      "step": 116
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.38977168134862195,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.8659,
+      "step": 117
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.33038949792135575,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7627,
+      "step": 118
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3300325442346146,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.8006,
+      "step": 119
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3463759373027119,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7917,
+      "step": 120
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3311395087967572,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7334,
+      "step": 121
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.35885955818374293,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.865,
+      "step": 122
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3650198397583705,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.8657,
+      "step": 123
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3825762896369597,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.8075,
+      "step": 124
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.33881715214348657,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7396,
+      "step": 125
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3410826447963918,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7526,
+      "step": 126
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3430038284191712,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.7662,
+      "step": 127
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.33893345771891265,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.8261,
+      "step": 128
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3325405498591736,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.8134,
+      "step": 129
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.40576390349578906,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7655,
+      "step": 130
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3716539369347945,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7717,
+      "step": 131
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.32600043431681647,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7809,
+      "step": 132
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.37421174165734555,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.8124,
+      "step": 133
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3400461814317215,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.8469,
+      "step": 134
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4235036346319738,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.9632,
+      "step": 135
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4244963028171556,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.8652,
+      "step": 136
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3594067107402808,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.8433,
+      "step": 137
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.39434917822992555,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7691,
+      "step": 138
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.38861442712413796,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.8303,
+      "step": 139
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.38875669196078927,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.8052,
+      "step": 140
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3397586653857691,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.772,
+      "step": 141
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3341366296530402,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7425,
+      "step": 142
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3713009002027508,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.8139,
+      "step": 143
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.30749923385219086,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7641,
+      "step": 144
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.34419207913198757,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.8069,
+      "step": 145
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.35337499951942763,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7877,
+      "step": 146
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.44648853414876255,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.8463,
+      "step": 147
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.47502886516564236,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.8545,
+      "step": 148
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.33272757670347386,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7896,
+      "step": 149
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.37941906422833727,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.863,
+      "step": 150
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4209815068037428,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.8091,
+      "step": 151
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.33933261841476176,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7988,
+      "step": 152
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4686490374433693,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7405,
+      "step": 153
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.35585369588278914,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7565,
+      "step": 154
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3957559739532033,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.8821,
+      "step": 155
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.34527612989533474,
+      "learning_rate": 0.0,
+      "loss": 0.7851,
+      "step": 156
+    },
+    {
+      "epoch": 0.9984,
+      "step": 156,
+      "total_flos": 367856468492288.0,
+      "train_loss": 0.8643133678497412,
+      "train_runtime": 4620.4934,
+      "train_samples_per_second": 1.082,
+      "train_steps_per_second": 0.034
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 367856468492288.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c08a7236f5b6ee67240b8aaf4dec60e97c614d1
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "k_proj",
+    "v_proj",
+    "up_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9445e9b30cf8ce239c5df99919ce9826406e6938
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c4451942d3694fefe2770f4aa99d5a6f31c34de779fb84aa2e034f838cd4f98
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ac33173b3f4ccc1a4cc9e05dc4e953e6c99119b9
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:853d5c6dbc5cf587534c0e5def614db82b0e2b8e2ad193b90e147e0a8f33e495
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc459e2c1ad6d4ac59b40adbbbd0e9f52df04d34
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9469230770785747,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4071,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9282190567648156,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4281,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.9139111931263875,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3374,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.930828366559463,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.4627,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8611553397462317,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.3902,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.0897129436057411,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.2479,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 1.1752358157877894,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.0697,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.6085116704511242,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.1268,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.9596327924478002,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.0964,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.692665038925857,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9623,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.7441815967151375,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.0412,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6316351307258408,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9233,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.54827818826474,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9314,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5545383500276659,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9406,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5872883551061585,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9104,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.508832380911569,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9962,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5630296535547067,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.9143,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5461364557349855,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8706,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.540240829170405,
+      "learning_rate": 0.0002,
+      "loss": 0.9249,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.6434196307881772,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.9895,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5154473813851905,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9361,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.4621933314943631,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8441,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.4913720833594254,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8488,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5071218302308363,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8088,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.557695660574659,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8954,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4843281772606149,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8793,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5008949491258939,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8348,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.44404190337560834,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8792,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.44257702525298914,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8304,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5651194264346859,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8417,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5898459615130699,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8225,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.46680207549711317,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8872,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.5143799869364328,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.9118,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.49200438756318415,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8039,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.45713307022560185,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8551,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5201454100406916,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.851,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4731751405020751,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8353,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4295612250683847,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8234,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.435223588864598,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8207,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.46549327734889817,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8709,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.4846646680861656,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8483,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.5367439131274088,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.96,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.44352326173213347,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8464,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.43119635750337504,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8601,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.4146044852574158,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7923,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.5312928654759015,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8561,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4335906008977014,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7712,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.48655259735726974,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.837,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.3994093879455125,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.7707,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4631361950212455,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8898,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4435082887651816,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7789,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.438265699705375,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8627,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.430215665242078,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7563,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5301972397142374,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.9061,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5131899059813476,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8849,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.41536687740839834,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7832,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.47223814289391924,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.846,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4256995099898317,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7631,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.559990879732315,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8867,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5043932569869316,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.7905,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.45011008673649056,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8417,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4812789673709414,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7674,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.6103731160662064,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7577,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.45866189675326735,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8128,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.44798586420124414,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.7607,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.42473200076232115,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8232,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5440976562417423,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7857,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.43250714012844876,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.8152,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.4318040586865442,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.7289,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4991345592151983,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.841,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4307284725126805,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7983,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.44595098079392037,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.729,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.444848102324931,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7942,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5384661547454465,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8777,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.44493622975189756,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8105,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5191835461601148,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7933,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.7669217869392135,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8539,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4596224083145755,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7398,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5209105604040403,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.788,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4603712507829723,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8453,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.49223156237577387,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7523,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4432794323191188,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8132,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4525001881259806,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8685,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.6363290906273378,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.917,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4221075035092066,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7898,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.45030339100473527,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8956,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5026018722226013,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8095,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4773694774912286,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8173,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4132220827826047,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.8141,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.508349523133824,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8434,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.45444478321210935,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7478,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4055452465399414,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7595,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.4802390536211738,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7917,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4611572439999133,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.804,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5458792676014842,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8774,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.41183351102035237,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7337,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.5010290726400706,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8712,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5350843332707702,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.882,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4810964147793547,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8291,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.42749800071113353,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.77,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.48480999886839776,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8878,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4279853264104187,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.811,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.4649343861345453,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.8168,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4587482438792974,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8322,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.45474669553836844,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7818,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4970454376768652,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7589,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.5520040221462187,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.9409,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4346714649503268,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7059,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.40193691976371565,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7715,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.46150868183579813,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8476,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.5399890321178172,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8239,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.43594284248019927,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7405,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.42116124999808713,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7814,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.42252061052504514,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8218,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.50537647594222,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7371,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4369670225061863,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7448,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.43656741472487437,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7608,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.4860084003029073,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7349,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4580469630955301,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7933,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.48687340458779005,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.777,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4045817052852119,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7157,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5247889494958011,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7962,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.4530743148650035,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7513,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.43938458385833445,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8049,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5675799632624358,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.878,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.41877422216064497,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7406,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.444603109052764,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7418,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.46843700148456774,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.836,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.46480591805152116,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8868,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4519091508052084,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8586,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.4805941763013618,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7041,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.46794476760137654,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8415,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.4542489214214423,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.8097,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.41426350430862596,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7327,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5170816909021524,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7929,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.45509561524075265,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7645,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.6019848514623966,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.923,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4195549964082855,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7057,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.39764833596753507,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7603,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.44713421891574273,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7406,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.4521681645533832,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8014,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.407900068696158,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7613,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.39477291753373855,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7705,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.43973864277854025,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8151,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4410872672283872,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7594,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4513826551120051,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7791,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.3860701722288845,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7573,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.39793214731682597,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7646,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4841101955135524,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7944,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.45124038050913845,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7447,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4891956717263449,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.8382,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5936939379837328,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8837,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.4342447571796585,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8167,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4422936157743359,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.8069,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4886946451030065,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7844,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4878581557072929,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8221,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.4571092467722902,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7374,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4640542504506806,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7281,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.45082713250627315,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.81,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.44702434061911905,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7366,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.42471669925252264,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7359,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.38147624028811195,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7447,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.46598752304955565,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7732,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.6092979174633653,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7664,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3820648925930006,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7802,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.42073776928733125,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7872,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4407185993406487,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7418,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.45039700269020816,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7387,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.5215052876963912,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8291,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4654364806533688,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.784,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4429785841820516,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7175,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.40214111772244654,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.709,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4399252399588669,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7974,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.45046986898972635,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7889,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.41692348879052327,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.726,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4141308420160933,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7045,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.43243963865867097,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7903,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.44779441827969435,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7839,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4408349153103571,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.778,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4601615754810862,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7473,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.4743084256987827,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.813,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5095671649899296,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7942,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.43686089823258445,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7383,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4496164103862414,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7573,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4173129820413503,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7601,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.5030615513929414,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.8161,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.4373938776604511,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.832,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.44732892527419604,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7348,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4441106926184888,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7854,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.39418950871040026,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7245,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4174735790493811,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7297,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4376586104045421,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7926,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.3934075321711763,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7813,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3872906434772159,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7568,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.5199897600647463,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7671,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4333691356437578,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7723,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.4309426916565077,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7338,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4685066334457469,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7522,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.42189623934554843,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7371,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4001192071128831,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7481,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4202041426268543,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.771,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.3901188956330055,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7511,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.41370302982075036,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7514,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4756093791617333,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7016,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.4551131144843958,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7769,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3996874527215859,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7532,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.43418596236815815,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7841,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.4435909439680395,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7421,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.4108923948917151,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7479,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.41523543356619474,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7494,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.42501622202702494,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7517,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.38516257244121765,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7106,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.3799701139245965,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7617,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.448632288852654,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.8401,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.45846707413489796,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7637,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.44923508813154894,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7353,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.40497548577982245,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.8084,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.45047625376539696,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7884,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.4339580624173808,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7548,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3828440757921989,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.8069,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.4369514957845695,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8094,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4575090185201534,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.776,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4007444086963087,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7568,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4223989380201151,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.742,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4622068135988167,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7765,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4417443583939549,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7712,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.45797968462019545,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.6998,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.45427742761700235,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7705,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.43344392615771654,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7197,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.43766179724002063,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7795,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.4063464894928757,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7191,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.48729880434325035,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.8138,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.43289463621923896,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.801,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4578743285606194,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8091,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4401419700620134,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7844,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.38232239438419785,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7182,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.4780141596008564,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.754,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.5261301180906734,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7783,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.3953958826213999,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7113,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4261184843469951,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7738,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.3876048475761003,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7446,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.563293848910372,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7773,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.49597633949832215,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.8199,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.40309914385407053,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7076,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.43205819932287276,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7428,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4546813831054762,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7618,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.43547603025942927,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7604,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4393298509391637,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7502,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.3922378692984408,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7169,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4467339195309958,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7653,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.46472235365699377,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7928,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4158360910832021,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7727,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4241191437999942,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7233,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.45320813593008313,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7647,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5029825382504564,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7287,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4030933346105237,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7515,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.40665107430457303,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6766,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4285404381640957,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6983,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.49332096614964,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7526,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4404654409534716,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7036,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.468121984035523,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7263,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3844639852471911,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7058,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.44008454765882693,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7427,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.41257510588456725,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7404,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4173626365969376,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7513,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4203051905670372,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7553,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.4113015838169403,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7265,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.46800861215426764,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7643,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.4319840048370039,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7687,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4124014086022483,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.709,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.48343787736347105,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7625,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.48913043881970025,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7852,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.4253409113694964,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7754,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3938290515647137,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7756,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.43889257025048206,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7633,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.48717407281038577,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7194,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.41399581850806794,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7448,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4897078239482613,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.8479,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.3964396201971349,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.6876,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5306089779892281,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7462,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.41523976943358804,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6936,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4443907381859673,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.75,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.4260433683841681,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7724,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.39736758898373775,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7437,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4184822449319086,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6769,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4062379305029076,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7204,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.3856949432268323,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7199,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4232113472632291,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7487,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4200357018800635,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7619,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.41314894755488163,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7396,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.46084270174612124,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7939,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.44597163842810134,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7792,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4202879770262735,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7937,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3956833731748927,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.7087,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.5094714223654991,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.8162,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4136715538531261,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7478,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3834657609673381,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7467,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.3875899812227674,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7558,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.5135466082811067,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7627,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.473434708183077,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7802,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.494591052056208,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7858,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.38863767319710224,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6249,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.5301734291936457,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7486,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3877927979879035,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7305,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.4758897119272412,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7346,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3919966698163836,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7181,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.5234209788796143,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.831,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4156827024701615,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.762,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.4257502618224722,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.6873,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3978561500511223,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7542,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3641376062984227,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6929,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.40315525921519396,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7669,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.457976722456177,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.8074,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3801884543600071,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7231,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.3619662965224284,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6541,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.46525565418240006,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7576,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.39408509550423587,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6595,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.46135368683694594,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6884,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.4455811109440872,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7517,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3791900124265542,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.6718,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.37039564009804155,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6653,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4378410753625376,
+      "learning_rate": 0.0001,
+      "loss": 0.7251,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.3904870365245172,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6892,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.41294804069804675,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7245,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.39447807767931187,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6216,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3970473272293371,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6458,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.43504285900316786,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7541,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4312285737778755,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7436,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.48158541308310493,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7686,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.393596011504804,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7185,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.45026029078386076,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7472,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5049887414853992,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7649,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.41758997906643314,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7075,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.5536933675395049,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.711,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3803886284428118,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6448,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.5064730268247098,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.8373,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.49463878600172456,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7634,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.43325806075114826,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6797,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.3825765694737797,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6893,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3904857528958253,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7152,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.3694232971992164,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7213,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4774550504808462,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6972,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.4308484251364807,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7624,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4311846684950199,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7753,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.38112940625024455,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7379,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.41182473120367014,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6582,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.4177078038129574,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6951,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.42872519101710654,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7062,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.3851660668178274,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7341,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.39399030180484224,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7355,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.41334658070908564,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7165,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.40102653640641306,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6956,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.35528223087853755,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6615,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.44321819856925604,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.8013,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.49562511557650124,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6924,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4549211444589244,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6976,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.4017316409897397,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.741,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4255975709811976,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7585,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.4326808155140898,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.647,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3730941650680891,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6648,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.35951044826380923,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6726,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4387350992288795,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7258,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.5194060478073095,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.723,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.527440204002076,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.7425,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.41191859370959466,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.702,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3956344631480436,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7361,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.4110903358914057,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6925,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4082118542755697,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6949,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.42478088922281826,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7249,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4105158859597807,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7342,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.37409171209139597,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6593,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3549294808565276,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6655,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.3800051561278298,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6862,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.469817845398903,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6136,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.39381855810199085,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.782,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.37178563261704844,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6623,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.38961453445864674,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6454,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3965513908576212,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7514,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.45064363977375316,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7614,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4060315144808456,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7224,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.42948802633016125,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7519,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4148025815142497,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6638,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.42792810627652117,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6934,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.542604491432513,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7424,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.38767158306259125,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7282,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3760890979170099,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7117,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.35373027523959183,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6741,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3988415205885269,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6828,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.36702684550040354,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6673,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.40084524155492024,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.712,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4455992821837806,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.8008,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.36874876966879794,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6666,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4072947971147104,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6708,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.44355072289605285,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7068,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4018794218851429,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7042,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.43150510748828014,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7087,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.4109261083980075,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6695,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.38117951752542334,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6838,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4078045343855914,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.8116,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3658698560410895,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6641,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.42265321140968654,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7147,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4207842507069505,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7368,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.44143686362239726,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7775,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4232521231647057,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6821,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4082101853695771,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7055,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.37104748761470746,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7218,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.4465202504946128,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.641,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3548116300074221,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6334,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.3829669115824023,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.637,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3945929599299863,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6556,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.47611041407388976,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7331,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5093088875165239,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7021,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.41670883842744694,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.755,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.38272799339439023,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6569,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.39320048501785004,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.687,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4897907430360386,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6742,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.4211981946672176,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7364,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.42346315382486294,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7589,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4201031380746793,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.742,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4367166279947478,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6513,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.533525415958094,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7052,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4331409525050753,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7169,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.41743183154937163,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7007,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.46284723769156155,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6717,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4271272261398075,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6943,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.44590071641931517,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6587,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.4080036879402065,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.691,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4304736034704596,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6721,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.4363828605874691,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7489,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3724797976524618,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7403,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.4202245709216763,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6965,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5034098519742084,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7503,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.41678967174539944,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.704,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.40363765184253564,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7043,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.4330321553411917,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7131,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.47084542795952505,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7202,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.48829209103839877,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7266,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3697866460660551,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6773,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.35051950407660176,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.656,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.39675532074823067,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6889,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3695985738810603,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6935,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3928421642818454,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6362,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4103553180916162,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7086,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4635742963022557,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7242,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4018941798692042,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6778,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3861659712887641,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7279,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3698439057070995,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6098,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5075796063308272,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7652,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.42888392159795086,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6949,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4170296196556884,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7358,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.40511114793948655,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6859,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4332015020841273,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7006,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.37324263563892507,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6578,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.5006566374888571,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7386,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4097760825770306,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6551,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3874633060404428,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6617,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.41740833056447024,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6585,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4059373537897428,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.7758,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4230459093313156,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7109,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.35983083399968596,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6781,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.3862347845991195,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6866,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4056112846174515,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7236,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.45792789391523614,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.666,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.48354667047008587,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.7148,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.3804556674907602,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6496,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4395721364200357,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.7119,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.4762425500374446,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6955,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.6394644509549837,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7075,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.40462114697933416,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6967,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4150022302268185,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6575,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.40195532472344037,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6472,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.41834785778762196,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7084,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.3777055438883092,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6986,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5231949258816744,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6752,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4048637448045943,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6512,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.43304092226186003,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.7268,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3830447693951589,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6747,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.39990984281664904,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6838,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.4551108198746976,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.64,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.40599131447436326,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6628,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.41692510773925323,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6515,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3896403328440383,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6815,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.4291388697518044,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.7245,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4208608690176431,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7182,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.4323144941925956,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6819,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.44938435426259044,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7289,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.5054956090695101,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.7507,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4634209912186953,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.7356,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4323898363207525,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6807,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.42919666902122455,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6731,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.351617997178594,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6651,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4722377856228152,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6616,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.4252233519419007,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6734,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4154506646637047,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7253,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.40310174664215564,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6727,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4495369483718136,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.7257,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.42026969436548406,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6851,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.40616232046271444,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6279,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.3827209722482836,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6716,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4118601937748562,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.69,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4397139733675419,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6872,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.37243775954787023,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.709,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4114136079364362,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6889,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.508150224886995,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.7401,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4181625831105891,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6769,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4349990879895087,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.8027,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.3862195368376329,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6735,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.38251050049217683,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6535,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.4802151160705585,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.7028,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.43690518070600337,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.709,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.4638498877700915,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7093,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.41135108464616504,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6963,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.38004016685640524,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6639,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.49320068092481323,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.7185,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.4808864838448925,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.66,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.4326999840300077,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6572,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.4459055186196011,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.7321,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.367808932618989,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6917,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.36685490738498533,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6452,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3612154568659507,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6537,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.4086733796279474,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6707,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3939845914942694,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6971,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.5303334440248394,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.7362,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3859726418970462,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7093,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.35427488135764695,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6892,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.4369389709783054,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6948,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.4117089123943465,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7062,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3532018953005476,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6542,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.47543943329516386,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.7187,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.37992988360707813,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6526,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.3922241337332064,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6767,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.37884468931415927,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.704,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3806859376653051,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6992,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.5610152838461234,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7056,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.38028207171332495,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.5972,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.43887978891017,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7067,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.439136811373697,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7162,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.39613847015380077,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7303,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.42107938942900913,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6763,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4530792767638597,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6976,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.41667850895366476,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7106,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.391418747510892,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6086,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.4360923992795412,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.722,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.5008453135264018,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.693,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.427954695595485,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7155,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.42627186878443935,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6476,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.45305074552722496,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7455,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.43952552027728614,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6864,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.41860905053696146,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.7371,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4429300783648926,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.7027,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.37197811285497284,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6302,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.41765937724539,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6232,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.3800434334049614,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6631,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4492829376654766,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7038,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.37246083957263676,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6394,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.42616944527222084,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6429,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3943748861470032,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6851,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4100631296136774,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6418,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.393456098253909,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6699,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4874209025333082,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6806,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.5599644755055251,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.7563,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.36657647567910884,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6408,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4017511875623015,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6812,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.5042786295890377,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.7726,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4337272599736757,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6798,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.38384618798879977,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6932,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.4064222200337202,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6825,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.43759684106046515,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.74,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.5555321453003748,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6836,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.34867305100422014,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.683,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.402036471779896,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6272,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.42220162411989065,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.5967,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.37826036175258104,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6512,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.5766733350158354,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.73,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4183193873217483,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7148,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3390489055854859,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.657,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.35672282263007266,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6687,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.4094248270745932,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.6289,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.33325937099727404,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6354,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.38970910705502143,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6519,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.39123053488286713,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6528,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.37504990575037767,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6587,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.38612516322755125,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7059,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.7522975311950446,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7632,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.35756864831313834,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6543,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.395755270390985,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6614,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4119893007021874,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.645,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.35649575503231873,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6647,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3893059272331949,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6468,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3649968056430924,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.5978,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.41255239145320716,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6514,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.40189145448990304,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6615,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.3775003757047657,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7187,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.376306862874467,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.663,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.49003531568940967,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.712,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4148530340083136,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6954,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.3630434995850313,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6404,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4403499211672423,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.7059,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.48847312558980904,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6619,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.49161581022992146,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7541,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3516916918805193,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6357,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5427759861095616,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6259,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.3685168817488535,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6587,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4160758218903834,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.7067,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4053343246014304,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6882,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.35443403133857604,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7107,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.37317721270640875,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.7032,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3668062564893062,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6722,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.3784881471364597,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.675,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.39109606865640284,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.7203,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.39833714198682935,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6752,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3939019094573661,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.64,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.43245303370695626,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6979,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3949825211480894,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6621,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.42296506088190255,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6012,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.44937484805390177,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.7867,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.4312758913114681,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6844,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4110098544270641,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.7399,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.41300146967284723,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6621,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4152309631283235,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6686,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.3682227316320983,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6659,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4004379938007532,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6414,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4114899285709478,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.593,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.4332671285477825,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.7279,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.37424393127606453,
+      "learning_rate": 0.0,
+      "loss": 0.6289,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 547629306118144.0,
+      "train_loss": 0.7479768854141235,
+      "train_runtime": 9738.2912,
+      "train_samples_per_second": 1.027,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 547629306118144.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..229b618b1930d985a82c33272130f026ad31b7a3
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d08664656326d9091f068152cd49de2b53865195
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bd07edd55e8f5e15997a3d00f4965a83f16b364373ccecf2cf89692973e5047
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1684f6be61b650f482eade8930fa015247cbbe3e
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6400c7299a9addf5511fe535658efe82ea180b7f21b1da938b71c4109248476a
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d10bb89d76f26d90a18659b2ccc4097883586cee
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8665415082895096,
+      "learning_rate": 2e-05,
+      "loss": 1.4176,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9361044587519807,
+      "learning_rate": 4e-05,
+      "loss": 1.4488,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8998720040196685,
+      "learning_rate": 6e-05,
+      "loss": 1.4943,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.8384372392831825,
+      "learning_rate": 8e-05,
+      "loss": 1.3387,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8772044641808151,
+      "learning_rate": 0.0001,
+      "loss": 1.1949,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.4529967231099963,
+      "learning_rate": 0.00012,
+      "loss": 1.108,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.713539918838152,
+      "learning_rate": 0.00014,
+      "loss": 1.0233,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6172981911132707,
+      "learning_rate": 0.00016,
+      "loss": 1.0161,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.48289881998891115,
+      "learning_rate": 0.00018,
+      "loss": 0.9326,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4543512003770902,
+      "learning_rate": 0.0002,
+      "loss": 0.9921,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.3828461222080745,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.9254,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.39901807190795885,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.8614,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4345341826900339,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9218,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.413160806313582,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.8823,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.45495385381233294,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.8646,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4281753953778973,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.8808,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3994236722592952,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8752,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.39117226882021455,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8724,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3833908827431505,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.8491,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.34870671989081414,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8571,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.41481670018385486,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.9211,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.34126965482355054,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.8642,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.3940371915698765,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8366,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.3415653902831056,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8105,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3381889283103069,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8403,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.32834606662461213,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8287,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.37945938489812575,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8416,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.3318956662864662,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8376,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.39869959883085637,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8111,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3698233300253536,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8455,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.33474133660285676,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8069,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3217236781724644,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.7918,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.3684982859817603,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8004,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5220395178503056,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8073,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.32558380289166644,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.7774,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.31661713173183603,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.7671,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.3298183068426966,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8325,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.31976726870616595,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.7998,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.3125134823296456,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.7922,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.32774997145846696,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.814,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.3402682312947837,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.781,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.3964859103107029,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8899,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.323683473038952,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.8364,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.3564126507612505,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.805,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.32723558996618596,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.821,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.30144686701598333,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.7483,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3597561681974569,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.7905,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.32634667281340346,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8048,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.36244033086415745,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8672,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3311512438280912,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.7923,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.35330775909171264,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.844,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3289965565274748,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8188,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3427039009285118,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.7692,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3430056329984371,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8109,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.37118704716300105,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.8017,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3725139392488711,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7789,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.30844351687662164,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.7923,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.35305601337396303,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7358,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.30221905832667595,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7397,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.31222145864981415,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7755,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.30463210235187144,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.7508,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.432146979637179,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.7729,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.38203631210937444,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8021,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.322834544631692,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7791,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.34536867964345297,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8702,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3558562140413582,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7676,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.32140353205176714,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.7624,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3496655157337084,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7679,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.3514609843872738,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.7999,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.324248969730902,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7395,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3167679143411775,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.7693,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.297240390835269,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.7796,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.31563529561124276,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.759,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.2859278361998818,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7517,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3320519979645269,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.7611,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.40254238852087254,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8519,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.32251277688662094,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.8,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.34644761845854694,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7932,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.34182589813646663,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.7233,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.320009189583074,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7667,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.2848719281876875,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.7308,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.31940884011749177,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.7565,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3060784615024359,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.7751,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.32952625214413267,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7266,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.34413779971624914,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.7894,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3110744615798355,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7093,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.30408532243174596,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.7801,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3211103718095818,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7112,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3172095159212251,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.7792,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3362603280371758,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.7528,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.34112146178712244,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7945,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.32228872212014087,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7355,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.34482854948240044,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7762,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3268373450822887,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7742,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3124781713850874,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7461,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3160555408217458,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7547,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.28533410896916955,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.761,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3466820081651341,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7622,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3170141736712857,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7368,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.32190854194201374,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7318,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.3131113945513269,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7466,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.30220107219934655,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7164,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.30952899615755614,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7578,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3327699148221942,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7523,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.2989396100887833,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7417,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.30436033938363616,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7162,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.31857262780407486,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7948,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3446000281880277,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7482,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3318185244789637,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.7877,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3124578801605435,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.776,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3222820758875199,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7861,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3357041321844993,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7441,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4364427407564158,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7673,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.33772379371505923,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.7361,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.31324402461981987,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.7438,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3284183728090651,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7591,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.3437955854964839,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.8042,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.29966031920462866,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7445,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.372993751987034,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.7565,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2909779283567334,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7319,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3400025202175588,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.7561,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.32075724784767795,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7587,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.31341798591594283,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7454,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.30536177734067066,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7477,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.30946114337713626,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7363,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.35863090762013095,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7739,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3177347916404916,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.7341,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3080407358241847,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7344,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3091175540692471,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.6835,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3216071074288792,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7278,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.30907097439090114,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7087,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.29533175707516046,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7338,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.29859223953122266,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7488,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3279670824045337,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7461,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.31258437498493835,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7352,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.35164392189882304,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7742,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3149668288035313,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.768,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3599061987303891,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7325,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3306079740957748,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7917,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.34636813413125606,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.7107,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3346405520896027,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7139,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.2928458590251085,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7521,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3109268449994832,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.692,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.29609544220706363,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7294,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.30305951144341053,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7481,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.33434639252903553,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7815,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3004114971651247,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7467,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3429547108807699,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7806,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.282764607021962,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7453,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3683700182658041,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7675,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3231498352348244,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.7052,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.316548397742817,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7333,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3251703264413508,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7192,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3271354998875943,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7945,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.2969502110605835,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7196,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.2706588923709536,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7248,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.312629753367858,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7611,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.29415145442436164,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7013,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.29003725066831476,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.6681,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.29780012282274543,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.707,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.2930838441084976,
+      "learning_rate": 0.0001,
+      "loss": 0.692,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2830263152992799,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7016,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.2920987174195825,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.6349,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3383941646660978,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7447,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3394665075780364,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7448,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.36044537057037745,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7565,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.34190965060795325,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7049,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3369093723997976,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7361,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.33384262876410487,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7205,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.27998672705210387,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.6973,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.30621372941067515,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7052,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3216249063829598,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7637,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.2932825307667918,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.6974,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.31375859729551603,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7034,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.29066252870835996,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7268,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.29867126390130283,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7029,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.2959064734553686,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7321,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3367281621157235,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.6896,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.30749267277288383,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7423,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.6004758823211203,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.656,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3164898094208934,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.6995,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3683208690733922,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.73,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3003501429522181,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7171,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3118763215594028,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.6925,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3176693090329536,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7274,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.27688207991554936,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.6609,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3245637087268195,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.6508,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.2803661278936095,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.725,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.2963398491469185,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.6988,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.34156377497721035,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7411,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3047256704802507,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.7101,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.36331590536488,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7214,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3412957404622521,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.72,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.2836530578324241,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.6792,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.2862139506723695,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.6922,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.29564303690434485,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7342,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3306938327020271,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.6893,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3019761746574809,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7106,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.27634579198568876,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.6771,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.28356734604700906,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7382,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.31036724528842696,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7287,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.32971015681074933,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7307,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.32640446318454774,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7161,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.2943147473660576,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.6397,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.2868426565380326,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.6502,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.36398789995494085,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7213,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.30021806944587576,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.7079,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3142832482478888,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.6839,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3321794459631511,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7524,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.30203696871415725,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.6999,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.31914058118188965,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7103,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.30496302518566537,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.6886,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.32059109058279794,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.6794,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.30870403266829066,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.6887,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.2940897712048129,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7463,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.33241866905107526,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7255,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.31263803463835743,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7073,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.32520913061756423,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7225,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.32569839011013324,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7089,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.27974184726308615,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.6762,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3004702382270151,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.6745,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.31632043613545113,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7191,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.29698328654448874,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.709,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.31943276650095204,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.6898,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3190564551595284,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7239,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.31518897840817817,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7002,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3360925029469203,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7055,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.2995768437981988,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.6661,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.2946731621228957,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.721,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.2911787551058751,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7028,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.2984102430978739,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7129,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3799106909399135,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.6967,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.31291620109632406,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.6905,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.4372365695739739,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7094,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.2921039485526144,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.6839,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3069506592386268,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.6824,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.2755230014315186,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.6905,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3075196306711869,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.6977,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.28606807442592747,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.6891,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.29895449980208705,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.658,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.30074188335195584,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.6707,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3112086183650853,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7298,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3406386820425041,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.7152,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3565063375536763,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.7518,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.33774418398181755,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.6895,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3012525960263884,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.6709,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3349355781125803,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7059,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3183580630182977,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7041,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.27245920061131146,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.665,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3081707374073957,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.6863,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3009755101021308,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7033,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3524705284560279,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7222,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3252227822817956,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7449,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3182706619088614,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6708,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3378575366682444,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.7109,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.34979777520246824,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7123,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3107408989330393,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6951,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.35083458885711105,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.669,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.30289873995071404,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.719,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.27703663598852074,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.6579,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.2997192438597723,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.6932,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.37058705280304277,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7306,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.30789069679483144,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.6972,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.2839052945678128,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.6897,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3060387262874097,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6945,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.28373072999622234,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.6981,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3470563517363565,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7103,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.2974310593458591,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.6625,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3113942036487974,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.732,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.31112002998100463,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.6953,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.2885181475137938,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.6669,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.35733844424630434,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7176,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3064149217986143,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.6889,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3333949354787685,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7275,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.31134604025405793,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7276,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.2734743339726324,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.6372,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.31277695488379526,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.6953,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.30762019412564784,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.6513,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.2902772414260849,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.6699,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.30134874847806303,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6849,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3390718969814782,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7058,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3199046583330719,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7397,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.29107381202022176,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.6921,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.321685007765904,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7163,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.29299845007697267,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.6925,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.2836179458349143,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.6154,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.33320743402639,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.6994,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.282111649725243,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.6906,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.2917477526657873,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.658,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.26813610476565086,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.6543,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.2893710331120229,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.6655,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3282785399000293,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7415,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.3914289158651545,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.664,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.2862620965554168,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.6638,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.27936560912003844,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.6317,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.29097538702075687,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.6657,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.28207822803976385,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.6987,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3410965732242547,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7136,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3086319422171524,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.68,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3496626009727568,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.721,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.36261055877706444,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.6409,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.2903284042216192,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.6874,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.2948685039371921,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7115,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.27829102739569256,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6984,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.28770393639606623,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7037,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.2873587388835803,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.6658,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3231525112021064,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.6904,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.32926237386900375,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7014,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3137732749319311,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.7192,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3129977563017675,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.6747,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3062371164589154,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.662,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.29590600103168474,
+      "learning_rate": 0.0,
+      "loss": 0.6665,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 796663880482816.0,
+      "train_loss": 0.7521186127112462,
+      "train_runtime": 9643.3372,
+      "train_samples_per_second": 1.037,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 796663880482816.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d391007f29d780ba267573f8c39cc3ce5d1ab35
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..40fbf5934c32a8304eecc40f24d599c549e0a4f6
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80f39ec3b5906fd72d240a40dbf2aa29de89e8405d5be1eb3dd7fa3e93e096a1
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a9e986407ba6a6e1b186fad3dd2e6b41959985be
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e653d5dcf7750dab6d448a93903f641d9915f3f808c87456068c49c58c954653
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7e2302618b3a5966b1ac501634b46ef880010661
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.1517286163583604,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.6687,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8784673231706809,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3376,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.855709195397984,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3421,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8545594441581159,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.3895,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.9596426719614186,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.3588,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7660632146845626,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.1282,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.9171820232586981,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1249,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9058274429801229,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.0576,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.9248654267702412,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9737,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.717052598253239,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8998,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.8750729713384804,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9871,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.7595872834902654,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9514,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.6110120148789119,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9019,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.6765927184994187,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9269,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5259268527470042,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8948,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5181980571589642,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.7939,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5405157764502099,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8628,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5241165122992703,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8811,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.6112460052726951,
+      "learning_rate": 0.0002,
+      "loss": 0.8613,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4723168313134422,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.7921,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5454638138288851,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8534,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5756397377455599,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.9456,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.5768150789347223,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.838,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5889828445999925,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8393,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5731511487094392,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9322,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4824620621492906,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8379,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.4662326949499265,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8727,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.45039686785700206,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.7945,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.4843308416362078,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9525,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.47706475655670366,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8715,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5573219523232408,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8525,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.43732275764080164,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8131,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.5557734558864179,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8559,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4752817029198449,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.7842,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.48562282417593106,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7702,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4838085619463917,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8311,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.5665340239580058,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8535,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.45309016580075406,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8583,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.46852754945064595,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8407,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4728208757643744,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8251,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.48544863508651903,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8656,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4518664862237965,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8666,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.44540747326702773,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8326,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.45806596449842496,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8779,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5313631148317027,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.891,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4518362124365464,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8882,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.3729338492969433,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.6883,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4809884109220938,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.8261,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.5463583180814995,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8297,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5172391859428617,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.9155,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4003614858527851,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8124,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.40791569838786457,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8116,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.4405159119511458,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8345,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.42517174461685453,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7659,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4460029687562156,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7865,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5226197779801623,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8699,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4937701360268512,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8438,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.38499805535185944,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.705,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.3957225137739923,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7261,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4436071355924252,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8062,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.5060143537952035,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8937,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4666310340240579,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8011,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.47516294685470856,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7643,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.41948296178257,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8029,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.9356137828629628,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8717,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5003178590315507,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8529,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5574950858380237,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7995,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.46550720319245187,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.8016,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.47864652266516605,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.7453,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5942420710638296,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8745,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.42090932279804627,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7448,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5049145369376222,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8252,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.5792342825330761,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8019,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.44998541600410774,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7326,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5365343085617726,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7877,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5795955132721454,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8862,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4675448423250248,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8409,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4555860592918079,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7593,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.3889970617977604,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7132,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.49458897281250364,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.823,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4715921148144629,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.803,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5908116456128671,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.9233,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4207991491358518,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8111,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4299783137076167,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7235,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.5059751236281711,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8008,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4622847840305097,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8108,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5036533790171484,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8419,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5170320322443288,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7704,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.45757400492822115,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7597,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.44937124185934485,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8607,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.42185510062315396,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7624,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4855170008842487,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8145,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.4009385360390872,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8596,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4509916168171082,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7805,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4136792288096338,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.75,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4085754322271624,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7948,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.5408603055037793,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8711,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4645924737906978,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8197,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.46107409721762693,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8253,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.42789001390976716,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.781,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4273494191543858,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.7498,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5073261389731233,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.7952,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.499046365441512,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7693,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5124867039670303,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8373,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.48089266147860954,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8139,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.42551932506628354,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7936,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.44217323707415573,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8486,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4117639798788838,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8302,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4542704177111429,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7589,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.414461489037142,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8374,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4153843596764285,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7532,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4529042454732251,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8298,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.496025521210255,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8064,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.603587742296673,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7193,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.3972364915263813,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7279,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5480722321418081,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7431,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.5253385197530391,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8351,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.4463762105084938,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7938,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.5148792249433735,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7272,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4338337418999126,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7846,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4448550964821197,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7662,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4249486343036595,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.774,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.45294012807830697,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7175,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.45204011098160635,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8116,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5161306527497446,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8453,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.47025758199126994,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8441,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.4283962123541972,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7591,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4955025052770196,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.8125,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4227675300350439,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7694,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.48702055259309274,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7718,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5549231795337493,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8284,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.401594485185252,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7444,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.48059717657336937,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.773,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.5327126518985816,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.857,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5284520114850126,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7846,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.41462380594660525,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7582,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.6397890378293987,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.856,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.44298566983966703,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4410871408286474,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8091,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4386374969077178,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.892,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.37786738170094414,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.6855,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.48666482876630224,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.811,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4580786753482874,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7638,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4573964941518131,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8771,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5037931767744466,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.8465,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.47089241182534414,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7532,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.4461847906516847,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7476,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5353422933484618,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.863,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.447860371365661,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7757,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.40415595292378664,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8061,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4591679956087192,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.8716,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.470229114598814,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7366,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.3916378327664192,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8048,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4713115111294236,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7729,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4041785964985653,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7474,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.48309315840712175,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7717,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.41339871995879307,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7527,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.45712444390966833,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7328,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.44446203830573594,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7934,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3905715206203686,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7619,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5228376785239668,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.8442,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.43727578992327143,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8153,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.46439048899549623,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7802,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4268493145485142,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7251,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4202729498940307,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7419,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.40547040665996986,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.8219,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.40398281132481145,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7293,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4271714328860988,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8337,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.47281130970675284,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.6725,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5586025504356148,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7702,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.46412984129645973,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7665,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.5354794076427467,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8185,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.47004583783231435,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.756,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.45787096718164827,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7976,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4170042103828585,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8544,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4236827748110516,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7225,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.48914571749208785,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.8141,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.450665610890202,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8068,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.42246892048462525,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7663,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4148639789300373,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7749,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.3979616117659437,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7507,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.40135218170704223,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7388,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.40606436029294757,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.8059,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.43380889159568203,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.8396,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4456063405476975,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.752,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.5226308379446547,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7995,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.4125504316119645,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7514,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.48192708168537784,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7362,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.46832378682779585,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7209,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4853704734176963,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7761,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4835689387307517,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7957,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.42821584240297866,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.769,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.5275649955045318,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7488,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.37896939611242514,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7908,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.5014224816022986,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.8664,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4302596939232914,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7176,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.37601956090573124,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.749,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4470864095317222,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7675,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.38373759587270195,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7436,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.40096002153413757,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7605,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.41497898478455786,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7357,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4627369083577583,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7713,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.43861119494110645,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7748,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4949192731053403,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8108,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.3892530496626969,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.768,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3678042621978389,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7824,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4436263250027655,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7988,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.46979121158729337,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7931,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.42801092121142625,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.79,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.39917120835121755,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7071,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5166072531894059,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.829,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.5034819696946465,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7299,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.394779804153212,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.73,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.40026759449463856,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7356,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.38774116439929845,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.695,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4270754809977913,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7348,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4531749309334632,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7713,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4220126930063332,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8162,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.41189207346382134,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.772,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5698244118323691,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7883,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.4197411432424747,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.7005,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3813200333236524,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.736,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.42176921762452485,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7886,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4148893867152742,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7268,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4420158919478724,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7338,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.42334138064913807,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7971,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.446143852873466,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7752,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.8084509813454605,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7829,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.39565668696801404,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.733,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4336069889808771,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7274,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.41970839219759815,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7383,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5212755834570003,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7914,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.4267944087662684,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7299,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.37292182944345237,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7079,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4219115953204863,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7655,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5386473654464474,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.8038,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.4967703925251337,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.6674,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.47150459624335544,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.717,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.5304487190547554,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.8314,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4605647276190697,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7901,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.41255225424760517,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7681,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4649401369258046,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7607,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.4110359298438914,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7164,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3691954624781045,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7297,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4302804306045596,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7509,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3849755894161268,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7579,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.4639946277185081,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7576,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.38537982413559235,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7242,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.4697285691125311,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7474,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4255353982284123,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.6864,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.5310855141742439,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7427,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.48918813587024496,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7555,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.5202997980251455,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7463,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3944315462290684,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7215,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.39552338258643244,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7065,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4582281893475035,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7638,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.4073482695204598,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7283,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4064157378706338,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7208,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.4210844965228265,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7104,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5699449595260611,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.8177,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4795970472350375,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7426,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.44336877516816464,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7849,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4105266805460912,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7601,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.45130024637132543,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7319,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4050160926419668,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.8104,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.5343629673832331,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.8162,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.3656076310630895,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6918,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4494914220087584,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.865,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.3700847529319172,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6773,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.43925354670515865,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7394,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.3716027483258037,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7134,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.387749521659969,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6993,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.4396050205960909,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6753,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3859151616960672,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6958,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.38685983405108443,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7733,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.477317920235237,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7834,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.4718679694581572,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7171,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4443648359335215,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7157,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.46705387836789647,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7559,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4324381846505133,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7417,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3907974055741124,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6932,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.41516503126378956,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7408,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.456137617001889,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7605,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.37970633126428244,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.688,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.47216579659845764,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.8465,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.46410904973970607,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7495,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.36707748031150655,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6868,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.40764819091821336,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7504,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4904339723714431,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.8228,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.5845436331459538,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.6908,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.41557578398633754,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7081,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.46555867308061016,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.754,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4218961421632011,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7925,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4484719712824621,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.764,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4229767112102817,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.716,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.39536664986377285,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7021,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.4120722600837275,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7373,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4195788289720658,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7554,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4223949634083843,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7553,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3558898854471773,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.6695,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.4118916695983032,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7136,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4129608908896074,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7135,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.4002321698132183,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7304,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.43198375109351433,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.814,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.3784203007384405,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6853,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3960736830144517,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6968,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.40835552170155603,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7495,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.439404146351219,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.8095,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.420994128897334,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.6639,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.39225139081963223,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7461,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.4485862323178119,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7733,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4077803252456207,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6944,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4432056065485272,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7279,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5329557517777089,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7954,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.4091730598792052,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7623,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4810112612191591,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7825,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.6365890894725469,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7943,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4111901388670336,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7424,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.38625652502904867,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.727,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.39117761965345,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7207,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.44567761261716654,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.706,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3842032489963624,
+      "learning_rate": 0.0001,
+      "loss": 0.7937,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.4005564577154571,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7397,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3686604462401117,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6783,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4209904332384653,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6673,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6986202466742053,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.8607,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.41210849915293474,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7255,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3924116009259164,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7682,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.41531418086818506,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7064,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.5419711040812547,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.761,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.43484669445975627,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7297,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.37971662203710665,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7158,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.40222616165885955,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7072,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3908896147908082,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7098,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4114180268402791,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7265,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4236879210610389,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.6556,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.4133354131488104,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6925,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.44023366180001644,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.707,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.39383222477658913,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7393,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.425900591805592,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6727,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.4410310504516789,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.786,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4089530094326561,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7154,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.39906684367445927,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6902,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3712457341250182,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6959,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.42932068292116693,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7888,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.44063700408114603,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.757,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.3694752662614534,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6822,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4244994037695542,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7136,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.44566512429217003,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7368,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4101734276384276,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7224,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3635602210799667,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6298,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.42048420552631677,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7032,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.3908678730309028,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6514,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3779444987018777,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.667,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.3637402307722022,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.708,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3959290525281248,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6877,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.37007921797782506,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7668,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3743741676728308,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.749,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.3965351397620762,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7888,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4101892634153247,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7952,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.4251085569177034,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.7083,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.5056503331618741,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.8109,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.3982125186699654,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7216,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.39706312840546,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6526,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3799045657114476,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.667,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4006507083721613,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6576,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.44500518273668843,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7827,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.38343395517520396,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7076,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.4071088842442927,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7681,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3527984613730762,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6879,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.4853538119951193,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7359,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4309492168388696,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.8066,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.4391508861892157,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7025,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.42193316628985833,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7399,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4234358749707796,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7547,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.38876566927042244,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7054,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4021044454948182,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7651,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.48667600007586975,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.749,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.38254117015863065,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6961,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.5928784195218275,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6982,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.4688124096954091,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6658,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.45186423242788126,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7428,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4462372246076824,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7112,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4111894574633264,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6578,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.39301804162906895,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6932,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.45009492097394654,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.677,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.457100909318662,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.759,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.37777549149897655,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7813,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.3641882091953124,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6425,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3944714771721567,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7111,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.44347824782901235,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6993,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.4252854974071172,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.719,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.37910642726706467,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7513,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.40388732183288534,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6867,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.40774502665216267,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6722,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3565949655526324,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6753,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.40349754342358957,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6782,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.39411785907761565,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7194,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.3422061634288728,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6673,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3806686347878848,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6397,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.4265471840720824,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7629,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.42451095650718557,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7231,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.3763223634525672,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6872,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.43155190822128586,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.7582,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.43223385365559036,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.743,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.41243673230459976,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6332,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.5021015889587451,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.704,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4223139997737187,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7268,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.5082179430384259,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7041,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3647414345693038,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6506,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.3619076769703431,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6791,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.41988404271514806,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7256,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.42540111458791796,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7357,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.3883567454520023,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6199,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.5739204044719651,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7103,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3814798108411745,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.655,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.4179213286850888,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6828,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.40706861671193456,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7264,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4135502510054031,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7394,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.37420008958601003,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.7216,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4236759742172755,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.693,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.36869906654334067,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6768,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.43738746207366114,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7576,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.37688900548958404,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7024,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4817127626688613,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6916,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.463120010250976,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7071,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.40705314884824917,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.647,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.41894699513477546,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6749,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.41486614034921926,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6849,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3987288082147527,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.727,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.40693878439217374,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7404,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.48122356069610334,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7152,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.4387112264116535,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6811,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.4329919657974128,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7011,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.39117666474117113,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7468,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.37539949340381906,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6666,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4141843669969637,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7111,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.465097596409026,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6923,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.42128752267401115,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6771,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.44301026272872385,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6614,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.393343927430952,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7273,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.356091850538638,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6423,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.38892981580001185,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7036,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4270517879334578,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7552,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3487738626170391,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6323,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3582878418210738,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6802,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3927117232498071,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7313,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3771596287275359,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6708,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3887304108114345,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7294,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.41132308825366887,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6993,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.5604341510454696,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.7326,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.36800808851318567,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6945,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4551362270717662,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7116,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3967014989182716,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6713,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.41810976389240995,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6636,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.39831162237046414,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7352,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.39152686726740915,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6797,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.3702035892320069,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.642,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4284658698643331,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7011,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4508346320756152,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7802,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4061254031044715,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.74,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5494262708940839,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7405,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.39341045162109545,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7094,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3959131207736413,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6638,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.3694834374891536,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6627,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4243882003836683,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.7158,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.3714137914163032,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7098,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.42340250033689464,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6826,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.40476712971856527,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6807,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4776919565263879,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.7328,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.36582983940998925,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6658,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.44933879235635416,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7881,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.38568764690834195,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6641,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4245513722777998,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6464,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3599442476497355,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7028,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3974167086239789,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.7233,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4024691565205865,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.68,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.45639176013861077,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.7159,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.5678650813149774,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.8147,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3655234186443611,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6329,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.3898767857363938,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6813,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.38366851029825955,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6019,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.41818273573782716,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6936,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3844323639396766,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6761,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3782640038884151,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6773,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.43045786152928817,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6985,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.38243215655916374,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6819,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4222589671661771,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.7213,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4139986346359861,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7128,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.35693680243173304,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6891,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.42730598393904784,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.744,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.35254768958907134,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6475,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.41770151723486093,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.7098,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.45886649927793793,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6776,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.5696876266783285,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.773,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4323839065907935,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.772,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3566597351324305,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6078,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.40544828268521993,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7531,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.38565418463244233,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6449,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3442537755701917,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6478,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.3874669780042446,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.7473,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4234105363998634,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6769,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.38179326044497314,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.695,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.38276100717171035,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.7053,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4696229659642793,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.8268,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.8555868502851657,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6492,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.3947201336864822,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.7087,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.43168055797308075,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6682,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3581726150858908,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6454,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3762481743735936,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.686,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.40301944000895196,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7253,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.45011718247532684,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6537,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.5877291229030772,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6793,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.38161037625625915,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6606,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.39844438785050906,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6411,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.40349370889355574,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6796,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.42373407540587016,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6447,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5181792587601858,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7684,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.40229006497217784,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6519,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5298666495639766,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.762,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3611511542028321,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.661,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.42073350377014856,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7973,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.33934604125514983,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6651,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3954898300065919,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6813,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3859046411055772,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6729,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.41168273314796267,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.662,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.5329268906641638,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.677,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3751302125049051,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6476,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3414502913086143,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6855,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.39370008735547357,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6613,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.4706464990815659,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7725,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.43440588684790993,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6818,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.4641564370485342,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6561,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3547950187068299,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6595,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4017536231519369,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6846,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.36796335900360183,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7318,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.3835028765880275,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6391,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4431301777922845,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6863,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.39326649560923305,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6428,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3960017804398548,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6963,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.3473501536168104,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6705,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.43796894089751437,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.8052,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.41778489241576106,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.7315,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4509501130995678,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7519,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4218560005017982,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7389,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.5009005588292391,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6953,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.40176857746830935,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6098,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.36381722317326654,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6311,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.387649987787867,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6955,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4089753780142505,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.7396,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.39354628384238033,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6935,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3883473028832529,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.7029,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.42996333687152455,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6968,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4498999480247996,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6774,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.3914723199388177,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.7194,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3692776963674782,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6458,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.36240500344216386,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.7188,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4345620651272169,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6915,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.3949763813975571,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6907,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.48200077738797686,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6957,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.40812750082411603,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6722,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.36757540821350676,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.7202,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4209376972986613,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7454,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.49240819082330106,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6703,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.42052473011611863,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6981,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.38666076352459755,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6339,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.5103278801828982,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6758,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3950790980482145,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.658,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.37367772953751666,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.681,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4722906048352251,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.7039,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.3952571039628059,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6592,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4054839394569938,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.7219,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.37853122272326006,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6838,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4388537566773106,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6672,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4075987757543486,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.697,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3728087912376961,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6644,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.40000303295602,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6699,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3970615658991015,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.6135,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.4781533003822949,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6396,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.43818192697582575,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7082,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.6072326147781507,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.7814,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3991212490595163,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6503,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3763315614668962,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6345,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.43224625562244884,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6897,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.37483903559422577,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.5833,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4294379674389417,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7418,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.3844121390612277,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6844,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.49491590543852876,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6699,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3817599359358462,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6983,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.388831097799436,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6938,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.4253964073682958,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.7043,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4019786521611652,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.7003,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.41029257226138827,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6478,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.40006702059796145,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6721,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.35776307548767483,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6266,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.47105249718282655,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.701,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4317895828275148,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6946,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4054666687694706,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.7091,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.36373593809758037,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6332,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.39891340503921363,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6173,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3989415289853632,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.7076,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.37187945556387686,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.7114,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.39326199413024704,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6956,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.40517977262446664,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.7022,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.3530929889532916,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6503,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4051342459696674,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6113,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4031360752836336,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6781,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.372718805792164,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6969,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.40443611518221334,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6658,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3957897106920901,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6951,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3675080072405774,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6447,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4363103927205831,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6462,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4000734569971733,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.7189,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.37295914146931103,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6138,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4334530927924268,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.71,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.43964206335543465,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.7398,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.3556225139605389,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.681,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.35110072559816907,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.7005,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.39265638565024086,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6924,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3693180236955692,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6265,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.39527655043634513,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6872,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.35127237654272503,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6414,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4139802895909404,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6779,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.46134657671608,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6679,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4182580697871817,
+      "learning_rate": 0.0,
+      "loss": 0.6586,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 544733249077248.0,
+      "train_loss": 0.7485533044815064,
+      "train_runtime": 9694.5152,
+      "train_samples_per_second": 1.032,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 544733249077248.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f2e2de7d050474c95de7e20903e415b441132d0e
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "down_proj",
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1e34f68cc8d9f40f9e783441361461dfd6c1f6fc
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06435b9e6264f2bb52e7961096b6f7286ca1a123ba7ae9d2c7eecf42946d69c6
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..15e0cd93d2ff97671fd55d157ca92d2eed94f708
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36794d35ab6631bbdad14bb7c69ee506c4c65a344907995f0d57be1cb24555dd
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..581725a51e41a51a184c556483c2007943a24d75
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9313031558184078,
+      "learning_rate": 2e-05,
+      "loss": 1.5031,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8263843841694538,
+      "learning_rate": 4e-05,
+      "loss": 1.4033,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8911585947754022,
+      "learning_rate": 6e-05,
+      "loss": 1.4189,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7762495657811989,
+      "learning_rate": 8e-05,
+      "loss": 1.3086,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9567510474105789,
+      "learning_rate": 0.0001,
+      "loss": 1.1303,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.081815042749168,
+      "learning_rate": 0.00012,
+      "loss": 1.0796,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8285018211345848,
+      "learning_rate": 0.00014,
+      "loss": 1.0065,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6531381930650225,
+      "learning_rate": 0.00016,
+      "loss": 0.91,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.48711720892258237,
+      "learning_rate": 0.00018,
+      "loss": 0.9225,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4034594199818848,
+      "learning_rate": 0.0002,
+      "loss": 0.8634,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.42072047599714224,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.931,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5072449995615028,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.8797,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.46380549861086345,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9179,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.385315677598143,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.8593,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.46137284387864597,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9418,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.45126315868831723,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.862,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.42373725126240985,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8423,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.36598981027567196,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8193,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.44213039096300505,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.8726,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.39789444791501166,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8407,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4092142360498181,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.881,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.3468050800538684,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.8674,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4233807018872381,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.9065,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.33971286367406983,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.7654,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4046955760630767,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8813,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.34005001112275796,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8236,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4025294677541818,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8051,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.3686358863169206,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.836,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.32331230707111247,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.7774,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.31282451699321023,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.7723,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.37439767878753266,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8478,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.32333681034525386,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.7865,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4432036670007486,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8627,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.37900974331118864,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8027,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.36170264976324784,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.7984,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.3283349748815554,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.7789,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.38010110559542276,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.7616,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.35779184963693483,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8295,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.3188972855562048,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.7941,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.33885595331420143,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.761,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.6599626462124875,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.8581,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.30590437571330525,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.7641,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.32451038077710237,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.7923,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.31066761640154694,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.7954,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3138858814462281,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.802,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.32614645591571356,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.7911,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3024007790436311,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.8083,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.30440235198263627,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.7668,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.352744802698165,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8415,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.31658794467190116,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8017,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.31492856970059707,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.7642,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3537817488504811,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8039,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3148253491813399,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.7959,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.31850399118450406,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8316,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3056092085262738,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7915,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.30536451054961206,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7883,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.34941693898355236,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.7571,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.296880968864248,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7288,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3322679426381215,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.8012,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.33260727442082016,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7452,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3081599535574942,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.7559,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3244889807411165,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.7584,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.348431414936007,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8334,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.33040662982295277,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7763,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.304768597360976,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.7595,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.32114990202854404,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7826,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.34670888584990867,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8071,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3135050010328246,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7591,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4007165410841229,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8222,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3098165798346559,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.8276,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3366496343563119,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.74,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3446479624854017,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8105,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.32538947912217897,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.7902,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3382513419761104,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7953,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.2902528244962728,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.7831,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.33295380789582807,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.7932,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.32491079311016224,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7834,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3219914116664492,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.751,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.30558099102554004,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.7302,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.30736544484761913,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7667,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.34356528061341995,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.8141,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.32407625571977955,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.739,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.31730106115240775,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.7717,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.3242457387120635,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.776,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.37387503794119153,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.7173,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.36544910247697354,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7854,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.3280297159655621,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.7668,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3060306803673194,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7784,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.340343694183801,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.8023,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.33147507262260134,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.7603,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.29967307880987004,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7351,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3070447135297137,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8144,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3371468656483107,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7643,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3301544042290975,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7455,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.35732461105198227,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7399,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3372671710177106,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7744,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.32368170886745823,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7558,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.347088503534774,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7792,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3098281533746313,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7418,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.29763542841261936,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7411,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.3131951129561186,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7384,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.33576606144185733,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7822,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.29032650311854463,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7639,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.33954362645047376,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7833,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.302213671546099,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7412,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3472468128228165,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7713,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3009835196525144,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7241,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.31496230188332786,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7087,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.32468960647814604,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.7866,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.35142485947695556,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.7743,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.2984111047966567,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7115,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3122067236813943,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7541,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3061309555680417,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7588,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3330261141466801,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.7725,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.2878265748509493,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.7239,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.32402311127004874,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7572,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.2885443877226311,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7109,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3325984376260274,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7745,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.35276302988307434,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.6814,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3743223954364047,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8025,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.31250095434003256,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.7543,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.27751291074535644,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7146,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.2916739695563485,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7413,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.30375483290863786,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7346,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.33201388009152266,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7116,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3372896307941712,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7415,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3061689590079631,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.7273,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.31379168320638867,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7291,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.2934848153355263,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7222,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3441301895087707,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7521,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.33242144922282324,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.758,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.2998764306620887,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7362,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.35453134249708435,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.8094,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.318190568854741,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7685,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.2924483481496768,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.701,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.28860962608238744,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7048,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3223093593911342,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.6787,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.32652124911022334,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7751,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3378567546264251,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7057,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3347874323081615,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.7427,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.2950258859285516,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7103,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.29929474750451557,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7159,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3499940407779636,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7881,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.30145031584516635,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.716,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.38401430285844806,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7446,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.33361204379628423,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7265,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.322019046159023,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7728,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.30675719855675915,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7094,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.31580729976842725,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7369,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.28023263956373423,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7057,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3095266279847274,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.7128,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.30435117677738777,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7652,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.2750215878076213,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.683,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.29938683531824195,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7781,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.2739445261182683,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7034,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3032647494750609,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.733,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3588377553282501,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7565,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3252058405483004,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7689,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.33937386250897783,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.7685,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.2748694284336673,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7181,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.30939208531233414,
+      "learning_rate": 0.0001,
+      "loss": 0.7431,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2738534135072514,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7042,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3974092001806277,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7634,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.298415119173645,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7466,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.30699336831051954,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7284,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.2901068669142659,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.719,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.289122044479363,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7059,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3030799207029953,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.6867,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3027641169951838,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.6983,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.29915201634856275,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7066,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.33877171450167437,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7477,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3019004859637966,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.6896,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.32101987397698073,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7732,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.29608955613782456,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.6975,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3903341635561156,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7325,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.2846065516794577,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.6659,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.27564468788525615,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.6608,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.27705456068017625,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7026,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.27581471128823365,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7589,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.297802511066448,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7905,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3452322304584979,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.7608,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.2928432329233493,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.6883,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.29460004736129036,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.6631,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.28123407758344715,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7508,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.2843635254634615,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7282,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.34889895491799827,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.7743,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3425113999228756,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7221,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.30595814993670367,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.733,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.35840367615756424,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7561,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.33104462323668715,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.6991,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3394753446738843,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.7099,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.32283853853312183,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.687,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3049519469814839,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.6855,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3095509339891271,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7694,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.2854844000449528,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.6779,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.30525546650522767,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7081,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.2833182748533542,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7224,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.2629180405414573,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.6747,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3037186612742775,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7015,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.2579515333812123,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.6525,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3015359549294114,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7437,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.30328608937618484,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7262,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3131312484861202,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.6913,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3428352859498749,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7192,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.32744225245031866,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.681,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.2789381341996287,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7045,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.29037036499383584,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.6812,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.2985271031320008,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.6829,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3011009096610186,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7078,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.303730403749863,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7333,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.2700747120155797,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.6845,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.2945402305786069,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7313,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3291065136259577,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.706,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3089232604005034,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.6608,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.29043173518454235,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7111,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3072972304823298,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7309,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.29575293212282944,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6941,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.272191115063121,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7105,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.29872578885296647,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7073,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.30797164640532304,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.6731,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.30888955330178547,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.6875,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3011025807384547,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7361,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.2634763705716138,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.6584,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.27702377174703136,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7055,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4738064858879691,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7186,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3216294244730204,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7234,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.31551523504983137,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7019,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.29166142886120305,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7063,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.27317702545893363,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.668,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3192281891362774,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7469,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.35440201327111076,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7452,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.28502204185604074,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.6923,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.29945490920230516,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.6925,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.29575834955378427,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7027,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3291841843584088,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7127,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.28583705498573,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7333,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3181846960491521,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.658,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.2819914163211931,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7208,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.31755915879086516,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.7026,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.36200326820967205,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7343,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.27246946326238036,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.6493,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.30218563568514123,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.6929,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.28757076182540703,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.6935,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.27698386751868853,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.7087,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.28280472449769867,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7087,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.2651273458522423,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.6997,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.33121071922944717,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7048,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.38190444874022617,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7801,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.27849697998142847,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.6905,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.26362848291419816,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.6528,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.29590890139488923,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7182,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.2842179617272862,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7079,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3220612731779135,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7432,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.31266632092760416,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6949,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.28786957578583217,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.6744,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.29790664374884945,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6948,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3172355195642624,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6821,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.2908686579144822,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.6693,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3241058543780178,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7154,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3341389142195567,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7114,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.27604065366111885,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7347,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.26797015474074914,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.6808,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.26746248447865817,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.6762,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.26681525542349926,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.6696,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.2788077513972515,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6846,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.32515261075444146,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7328,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.283302708290937,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.6677,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.27776316297126497,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7168,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3150518012713109,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.6685,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.28243642250546974,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.6789,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.31271458346800596,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7471,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.3162378069331692,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7485,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3326107943100678,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7241,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2881515697148622,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.6297,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.29488227027303954,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7256,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.2820180332448235,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7046,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.30787216006833723,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.6933,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3034242260555174,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.6961,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.34308439843199623,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7172,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.32591428798684924,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7055,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.2900530208408788,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7069,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3347468750592578,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7138,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.2977111126537549,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.6741,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.2874759354056903,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6745,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.32717632776157757,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7036,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.2965440455424347,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.6976,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3032387027931161,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.6848,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.2757262556517274,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.6848,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.28723381852699836,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.6497,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3411539336294208,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.6842,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3831063367203094,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7287,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3119361218684643,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.6687,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.29122409002191013,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.6698,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.2933627610749106,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.6876,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.2781149714045268,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7088,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3046275387540745,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7088,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3000535326506705,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.6684,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3034992720452355,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.6749,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.2978087268151753,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.71,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2943213594302957,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.631,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.2837772097273212,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.7203,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.27747160009113364,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7056,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.3021156038593126,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.6424,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.28447776946219105,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6965,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.2904170570582678,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.688,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.28693854411619896,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.6536,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.2830799509701968,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.6752,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.43784320401705396,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7293,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.25592207413800777,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.6987,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.2750153014419686,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.6692,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.2684110083374372,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.6751,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.32660971726192795,
+      "learning_rate": 0.0,
+      "loss": 0.6829,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 795733460516864.0,
+      "train_loss": 0.7523357088749225,
+      "train_runtime": 9661.7281,
+      "train_samples_per_second": 1.035,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 795733460516864.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e919187fbf913713e69fb440e8110e4d25b1ac30
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0b62957b9c7840f6cb568c86e2863a0f225c3555
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9d1227e4dd9a0cfde2fe69b24b286992bbae19535fa882a8f96b47d953c9801
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a8da24431df71599ffeb6b080b8db6bc7a04c380
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fef31742711d8236f4f4c850e0934aec980db88afeb35d572b9a68885e93e357
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f69d43c2a1ec9fb65d169acf964409d47d0ae3b
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9307507358110347,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.3678,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.1911510039507764,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.5739,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 1.1329261461162867,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.5704,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7765093235427704,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.2855,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8969381600574139,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.2613,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8953363951786134,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.192,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.9418089360353965,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1738,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.3012452150233644,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.0476,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.8321289166655941,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9973,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8764791327346797,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.0245,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6311716463900636,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9404,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6250088287070071,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9513,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5785281004669424,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9685,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.626495141490426,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.947,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5326607517261951,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8805,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5238320523852994,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9079,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.6599520510016996,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.9736,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5935929471408434,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9538,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.5214400161127966,
+      "learning_rate": 0.0002,
+      "loss": 0.9133,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.49158434527043327,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8529,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5761943028543544,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8389,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5240754907632433,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8684,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.6319870835376165,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8933,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4398463654248781,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8404,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5354998313987915,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8627,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5296954730016616,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9904,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5545084227982735,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.827,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.49894046941791115,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8063,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.7330695616422173,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8793,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5504392087500234,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.9445,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5138771247819872,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.9532,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5357310231971444,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8373,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.436458649534939,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8142,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.43637696361672595,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8147,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.47769382880518035,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8219,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5691186156905623,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8832,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4403064208176803,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8122,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.43879736958934057,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8792,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4950759543534755,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.9137,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5544219940135219,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7792,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5696165767831806,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8525,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.8056279621643284,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8261,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.645373156488571,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8033,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5809821811726844,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 1.0064,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5237957226770983,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8589,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4787740915558479,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8778,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.45753284287165796,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.81,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.9398745375460666,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.8323,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4529340832996081,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8218,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.666389495376452,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8682,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.512483360726018,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8956,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4615015180628129,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8119,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.41822728845913026,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.781,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.439843201047384,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8696,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4296610243379561,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8211,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5155399902660266,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8404,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4662602814333803,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8231,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4269443465627204,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.8061,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.5868211754520536,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8667,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5401700531380792,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8249,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.5275268741042621,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8194,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4950585563274784,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7906,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.4594902337967978,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.845,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5555461336900863,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8507,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.46015831447117106,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8132,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.43769054593724865,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8251,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.4309759425171532,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8562,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4556328773535883,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7385,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.564216800136952,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8794,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5820634576821899,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8319,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.49068764874375215,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8562,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4262934451493423,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7794,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.43154854225837036,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7582,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.41467307557921185,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8181,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.43925945137208655,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7741,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.46227512591367387,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7864,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4419905355200034,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7695,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5165657370156446,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7723,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.4403033444346165,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.8572,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.47897736403075963,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8189,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4599115930160895,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8252,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.40066554938838556,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7582,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4984653951622605,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8255,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5397071279329464,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8245,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.42163317045894194,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7135,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4888518851875543,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7642,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.531963899827601,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.9528,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4354611813055882,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.784,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4048432684336353,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.8194,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4086241078858841,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7283,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4336295842212704,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7621,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5431483909710855,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.9061,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.42167546766193187,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7601,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.49652526503168953,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7868,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5042251597152386,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8008,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4660459696307034,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8621,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.5278479381348781,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8426,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5136081800388119,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7843,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.49722508941229265,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7822,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5125088004208799,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8668,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.5477111636891939,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.806,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5042188986966278,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.7822,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.4152580378172454,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.775,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.43906996296378337,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7744,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.43539466027214696,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8257,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4239378487862455,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7692,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.533212030471839,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7879,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.41851795066862274,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.789,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4936224317687835,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7451,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4150792579010742,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.783,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.49955970550870915,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7708,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.46522933096843166,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8511,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5077447088384153,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8148,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.461837784752004,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8102,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5274874349470041,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.8419,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5119441434124525,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8212,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.43939980505918763,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7677,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5049785758789185,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7267,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4832395598923605,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7773,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5285882253880008,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7878,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.44652155281764777,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7555,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4384116771377595,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7618,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.4503216438102163,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8336,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.45399287100490054,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7825,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4911803310924581,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7849,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4475560318620542,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7986,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5410613805002472,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7925,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3728770521983605,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.6947,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4041103912264637,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7613,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5740518300279095,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8826,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5212422666013237,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8067,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4553858820536246,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7779,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.4990540070803385,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.8145,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4100061474777309,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7635,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.45392924863684636,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7389,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.43829101817979416,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7486,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.4467052785057778,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7947,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4616964853806264,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8073,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4281905319998942,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7307,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4473416989554217,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7899,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.4217308286391299,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7788,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4546622820724357,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.8489,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4291947161668821,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8209,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4421224544920921,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7657,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.43378881989450496,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7967,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.42238741761879933,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7537,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.4152582962478556,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7753,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4330144364111337,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7962,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4108809995226594,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7966,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.40598672641581,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7633,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4167982043506597,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7041,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.42014200520014094,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7193,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.4573788195799367,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7512,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4803142053827793,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.8012,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.3632582425441092,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7589,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4271821317661616,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.6982,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.5370692376133122,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8071,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4204907018652622,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.785,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4511978833789513,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7735,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.38997839389330263,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7175,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.45712195717321497,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7999,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.409363952347818,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7441,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4216699239112223,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7585,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4203526658961756,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.6942,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4316640766886136,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7799,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3918992414000058,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7611,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.480952025030275,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.807,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.48029330507987417,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7873,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.5007752530950041,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8153,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.41193134414047006,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7823,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.3705783893043457,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.6882,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.5364714578308167,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8123,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4304840396947207,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7226,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.39176723381890255,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.6553,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.473933800845934,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8249,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4783955117626432,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7417,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.43812649136651893,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7362,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.42093229013970423,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7559,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.43412761063692157,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7668,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4114487717141203,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8001,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.5174273889532307,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.737,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.42200383733853786,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7676,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.37972005206251447,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.6844,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4187265512900597,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7754,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.39674951422645327,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7476,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.43085954716816965,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.787,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.5261687847463365,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8032,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.38618780370611927,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.738,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5396863057893749,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.843,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4704033989607021,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7423,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.47406714484336154,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.8133,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4427659708747952,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7881,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.42786510474475103,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8183,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.44815258181100043,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7499,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.4253238194388484,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7833,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4141363592408863,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7751,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.46951635497507826,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.772,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4398646647277728,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7551,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.527082865550504,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7694,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.39510138889240665,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7693,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.48632553909689946,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7257,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.43376164298659,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7286,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.4212103228080253,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7627,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.45252904440902125,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7974,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.3962314240292195,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7366,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4327265506871692,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.74,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.514933329734334,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7318,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.41587989020386307,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7962,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.4403701107153674,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7474,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4202181716022083,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7706,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.4120642937438657,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7493,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4480337234157597,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.725,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.5633757107881812,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.8487,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.46406095606304976,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7553,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.5049068598510368,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.8129,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.40075745127741647,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7375,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.5024374954187978,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.8115,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4633484981898426,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7774,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.4162899199373428,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7338,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.41669489939623083,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7608,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.4326687946907688,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.7278,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.40643779999339563,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7128,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.40288496941550733,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7538,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.42603406164838237,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7597,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4688532645240747,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8368,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4332807404697522,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7325,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.476697765412092,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.8241,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4436651962043295,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7151,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.45433089014881944,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.6574,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.43375495977111883,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.734,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.42738324793054794,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7318,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.40877520369426673,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7612,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.408704133205752,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7062,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5337340604069295,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8101,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4144611426632878,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7162,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.45012679041480336,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7141,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.8796246533658749,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.743,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.45709338800740185,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7904,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.3579803058272898,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7025,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4345013477153899,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.6787,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.48881781965391935,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.799,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4280144158388546,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.8191,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.39269162859411183,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7422,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.44400142267856024,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7515,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.3771035900564488,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7656,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.41880705442305655,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7471,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.46797066447683655,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.8872,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.47955664305877593,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7527,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.4275337022637046,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.752,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.447119769729004,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7691,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.4089115544742617,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6738,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.44071585635928706,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7376,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.5365017095624967,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8418,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4151943479460226,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7133,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.42272229703750513,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7689,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.41818895231074066,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7552,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3745276342388048,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7207,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.450655111288209,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7831,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.4494743647552362,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7478,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.46038560262905603,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7465,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.3911465389838148,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.766,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.44758425742925684,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7222,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.433744583645182,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7889,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.44729644296451876,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7617,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.405956052730903,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7481,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.42197502885627625,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7914,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.4471034776086442,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7765,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4223092383131499,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7371,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.47622888335383845,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7389,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.44051030500926475,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7339,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.513880230702324,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.8257,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4168046060852622,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6652,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.3802093771260786,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7385,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.5108787524389692,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.759,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3876960955299403,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7061,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.40129654726182434,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6962,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.420496259550952,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7369,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.37858645511244804,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.6701,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4699367294388282,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7445,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3798508288727862,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.6693,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.4602324367778785,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7183,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4012131095306747,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.6667,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.4726607075165429,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7627,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.40340330901336047,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7308,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.412177457263275,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.74,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.42473218036945015,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7744,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4275229972291327,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7734,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.44905637380247715,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7404,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4291634918113848,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.708,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4091422186714648,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7464,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.43783368283656954,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7346,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4608607130020425,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6936,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.3768622769412308,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7528,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4209394222759887,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.7921,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4232745141703031,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.8449,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.6815490088866514,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.8462,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.4499466203511656,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.8488,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.3680896796550512,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6303,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.3775579981479662,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7493,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.387847585549817,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.6977,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.40613518614587685,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6898,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.43863627708276465,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7185,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.3889560057701365,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.6758,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4246062002636041,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7409,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.4391395598034235,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6846,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.45434575132168414,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7148,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.47366659509597836,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7921,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.38152089971111897,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7133,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.38382088281979165,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7264,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4395521307482716,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6851,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.4206238065270516,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7188,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.40098244852313136,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7042,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4018951442478255,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7381,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4033163134478128,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7689,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.417080407377218,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7526,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3655296102005045,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7822,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3582719524840942,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6148,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.6193022683975565,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.8375,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.5796649389110442,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7782,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3660794304482504,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.6737,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4020716164916512,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7459,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4084027884383054,
+      "learning_rate": 0.0001,
+      "loss": 0.7405,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.36290419424211623,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.685,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.39503822991480864,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6608,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3579770266047511,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7083,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.4087287034567777,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.7289,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.4839466948497212,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7328,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3744868255230105,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6718,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.40775717632462694,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.762,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4586551782279262,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6693,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.4281910470889782,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7011,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.43014491789502923,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7538,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.4723560588807367,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.8245,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4051222703401866,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6607,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4114137413271673,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7228,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4281954851576404,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7829,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.48289740774465234,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7031,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4028132723028769,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6898,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.3980925287529609,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7098,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3909572327359679,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7263,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.41392789312871603,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7391,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.42630666106608667,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6744,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.498629422281744,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6997,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4122424664938719,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7162,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4913748005852001,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7293,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.3536713211110254,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6023,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.420690620076411,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7566,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.38799097318034,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7254,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.46951331944016117,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7676,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5125519464479356,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.8276,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.4610250816054034,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7358,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.34069998754998165,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6724,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.42034988016896346,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.743,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4525953625962461,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7148,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.3722953782287262,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7316,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4600152810330939,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7374,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.386769626559561,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6661,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4047703539716422,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7078,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.528647696893897,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7029,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4660467946759398,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7477,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.39172340459603056,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6979,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.41515909712934274,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6949,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.4338402676367551,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7308,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5016467346485571,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.7742,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3655294327494492,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6789,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3992235175217145,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6587,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.37379208794413066,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7055,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.46885847173208745,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7399,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.5101176189041192,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7649,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3813931147515439,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6728,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.4015163654121684,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6798,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.43697057995490984,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7125,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.39582239698428895,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6943,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.43455119502189843,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7419,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4743973053815229,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.754,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4981920198315926,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7306,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.3435811784703452,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7041,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.4676722596596705,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7233,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.4163922149187722,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7852,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4386868522058961,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6972,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.3580208548686132,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.72,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.45139710327502214,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6861,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4009493880827884,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7333,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.41241573495889516,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6938,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.37551723461841824,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.687,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.417073555060805,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7767,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.34454543212225475,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6765,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.4176076168000247,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7414,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.41253313320444646,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7218,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.36254702485396967,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.6998,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4532720414077359,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7028,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.38065089978604044,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6944,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4576595436457372,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7512,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.42047403152676116,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7768,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.3953060092081604,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7042,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.37263074777798094,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6913,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.4110666177393186,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6652,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4001093881378836,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7271,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.39674680837721515,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6991,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.36559532012893964,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6907,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.41818675806839695,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6753,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.46023800073649845,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7153,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.41876830239649976,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6728,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.37212098102703917,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6154,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.3886496475370002,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7295,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.37728613039090586,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6951,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3943481381878099,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6747,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4231106370387455,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6679,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.4293879013145125,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7042,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4127396745960478,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6951,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.43868355880858784,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6889,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4203498442648587,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7767,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.35312458805384117,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7057,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4972641758757012,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.78,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.39503459426723275,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6531,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.39455726810213704,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.718,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.5168695038424747,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7102,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3841881282938994,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6487,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4540275111280096,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7444,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.43833529799612947,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.7671,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.3945980102733958,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6655,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4149063985363043,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7909,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.49822598691621023,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7377,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5097136232515275,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6614,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4177782773828526,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.7057,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.37177402594667786,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7117,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.3685581881197926,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6873,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.46556776071888967,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7756,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.35266770886849363,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6912,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.36369085697142967,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6978,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.43709851863195315,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6757,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3829946382761662,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.692,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3497755554871654,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.643,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3574329631618023,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7318,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.3985924935783683,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.673,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.39976928781715837,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6978,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.36214527971925986,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7079,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.38255440557898723,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7506,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.4503004187468137,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6702,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3841330427952996,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7128,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.38175740334910613,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6739,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.6598999413896037,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7369,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.39589704447950774,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7133,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4373451771733771,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7405,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.41202339742356464,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7241,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.4147292403848051,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7396,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.39756824380394074,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7464,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.39536317275028776,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6789,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.4020658688068476,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6476,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3852940469110496,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6535,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.37806788144113224,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6732,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4771061846246495,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7475,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4137143209238499,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6498,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.40453503233945426,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6685,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3286796782395469,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.635,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.45348754416461634,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7635,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.34864467592078296,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6743,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.3548118500505629,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6693,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.35704873920277636,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.718,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3420733715452814,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6425,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4488889706342674,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7249,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.44498258061855306,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6507,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.38288433771548464,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7291,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3925957323413787,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6717,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.3856360933870024,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7038,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4765791148040302,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6525,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.3910871609045509,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6281,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.33589947019240074,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6655,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.3542736538025536,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6567,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3897033488704761,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6661,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.4349004326943216,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6375,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.36368302331411456,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6337,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4864399520526362,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.7209,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.392233826310996,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6715,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.41925245267701783,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7298,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.37658896500636835,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6181,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3882127187742822,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.7136,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4896363428181879,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.7591,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.39284102468656,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.691,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.40202454679797983,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6698,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.41308583418409484,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6387,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4498832761119042,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6923,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.42557242477883767,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6601,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.37082914710964937,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7148,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.46399745598658604,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6933,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.408244883944075,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7124,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.46790313701561737,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.7316,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3954420659794482,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6415,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.5160446694784617,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7972,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.407372719888538,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6934,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.43382835064241637,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7157,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3667955374742887,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6437,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.36914259464831484,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6657,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3894458809663143,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6928,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3899127756187008,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6363,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.40349842143611453,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.7303,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.5535080070125004,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.785,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4492622366179551,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6326,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.4241277805722848,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6273,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.36507025077689886,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6909,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.37298271092376206,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6932,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4567341972329579,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7119,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.3626668007110665,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6939,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.39980272585394583,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6755,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.47378634820576804,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6768,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.38966727992559264,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6876,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.406480465815636,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.708,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.383192054205226,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6526,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.4357434266593055,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.7447,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3716881348555911,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6277,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.431139945516892,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7149,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.34663841079830454,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.617,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4462568432993696,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6918,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3713079668658502,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6789,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5257320001228583,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.7435,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.42031223753723,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7457,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.4131206466935399,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6651,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3632211990047449,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6186,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3554901112331569,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6931,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3863062310695504,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6086,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.5859591179126187,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6622,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.34970699829024526,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6294,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.42096407697177807,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.7137,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.38622205030468887,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7311,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.4129989963301976,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6875,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.48636896826486653,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.7578,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.505134167661199,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7281,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3736993059804803,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6995,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3940804048202612,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6943,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3990774475655975,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6637,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.37042381505898464,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6543,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.40837938280234165,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6777,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.4043181426273858,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6471,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.39529515976684354,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7021,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.44449771703885715,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6829,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3931107651957264,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7135,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.39318479177043725,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6756,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3559067139522861,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6402,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.38875326655337555,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.702,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4815111798863259,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6987,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.41201940639359147,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7208,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3629621830209102,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6705,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.4038965497768723,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6447,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.3762118491410858,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7129,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.3562134753536249,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6769,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4391952855141163,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.676,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.43833627573033557,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6823,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.39781394659560937,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6684,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.4078308661940925,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6746,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4049144720844558,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6779,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.3536177694216831,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6594,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4077623211082068,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.7325,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4131997194370555,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.7321,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.578312625959774,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7399,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4083341879476406,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6443,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.43898568804268984,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6969,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.462696237901638,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6729,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3910910585414048,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6521,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.38550254011439905,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6647,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.38144526706104076,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6762,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4118249960759194,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6232,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3615210025976185,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6161,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4127086872608901,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6476,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.43217465575323877,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6634,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4523386106539392,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.7196,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4025003178681048,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6766,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.39382307042669457,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6541,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4364109328096091,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6831,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.424692957408124,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7126,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.46133510175309317,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.7628,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.4145843808424795,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6789,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3634617890226472,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6781,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.389987881967966,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.717,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.46779116163036566,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6959,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4602503447808005,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6954,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.40139265943375085,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7356,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.4251079728922497,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6926,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3378013421472393,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.6677,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.5635726805244304,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.7779,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.400951001521151,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6798,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.3753513048860841,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6961,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3576763689201268,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6694,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3947477942729083,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6445,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.37178522809350467,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6269,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.3468678976188837,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6385,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.5028190777600934,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.637,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.40916482529412307,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7019,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4308003991535694,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6857,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3962208433985528,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6895,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4191855317242016,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6972,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.3605037686158801,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6957,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.37104812216124583,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6586,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.4917061399707436,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6588,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.4208216791149571,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6511,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.3498415188467899,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.575,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.42668404532742993,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.723,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.3530768189661101,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6135,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.37232146648601894,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6602,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.3458193930003985,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.7323,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5055609079845153,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7539,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3779179889601207,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.706,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4853083100791647,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6754,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.43823452893396114,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6982,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.37408287883821567,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6649,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.48208652361982396,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.7195,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.38707122975730224,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6615,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.3549767879530745,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6401,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.4459824380757412,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6886,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.3751741377717446,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6353,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4888587128899068,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6977,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.553546441262832,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.7254,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4598376545696746,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6825,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.390502964804027,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6519,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4077541528941456,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6666,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.41081326066795504,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6526,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.34632048003078875,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6367,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.39446972308909783,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6707,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4391317041949358,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6925,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.42626044736103313,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.726,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3745101807288713,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6656,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.36825235507652904,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6548,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.35790882557698545,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.639,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.5101147263030844,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6959,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3990696683134663,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.7036,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4254641164398092,
+      "learning_rate": 0.0,
+      "loss": 0.7022,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 548779134353408.0,
+      "train_loss": 0.747810267162323,
+      "train_runtime": 9756.0827,
+      "train_samples_per_second": 1.025,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 548779134353408.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0ab0f7ae56d5d43cc5716a8a91b0d300d3d8791d
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "gate_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..af2aa63a7186a9fcf53846fe5731e09c69510983
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a272106dc68e4b6a96d4784e04d8ff9aeee98af4db7f27b1410290cfa9c1a443
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2af294c7a63c17bc6403781935c056c754d66c73
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f954291e8362b47f02bdf8c359893689eb2d775e62b6ec84a176fdb86474e51
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..231de36c6134f32fb0b237aad0a06d81d492e689
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9799427462773762,
+      "learning_rate": 2e-05,
+      "loss": 1.4708,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9651716724144156,
+      "learning_rate": 4e-05,
+      "loss": 1.4713,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8214609444640415,
+      "learning_rate": 6e-05,
+      "loss": 1.3885,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7555698733537519,
+      "learning_rate": 8e-05,
+      "loss": 1.3231,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9791891234053539,
+      "learning_rate": 0.0001,
+      "loss": 1.192,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.2500279666276863,
+      "learning_rate": 0.00012,
+      "loss": 1.0631,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8496873478151169,
+      "learning_rate": 0.00014,
+      "loss": 1.0561,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5582296792359828,
+      "learning_rate": 0.00016,
+      "loss": 0.9542,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6141848460525696,
+      "learning_rate": 0.00018,
+      "loss": 1.0162,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.40565836326113536,
+      "learning_rate": 0.0002,
+      "loss": 0.9166,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.43961376645695954,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.8846,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.42249345100721597,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.901,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4997460360846223,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9676,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4378341620615057,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.8422,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.4484500519813879,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9325,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4624548605603959,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9258,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.35728208889300955,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8349,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.411935507304102,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8717,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3650781718408625,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.8646,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.39246560512723355,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8595,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4030410211829131,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8502,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.3837493639747341,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9152,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.38342018894888275,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8757,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.33016580945906,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8253,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3529876724009927,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8466,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4185663760260027,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8654,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.33589268180561294,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8265,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.3790810727528171,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8394,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.33991371443148344,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8227,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.36628591292131574,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.849,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.34554827689022766,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8071,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3604273129042498,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.845,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.33081663785399623,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8196,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.3343038694634959,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.7966,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.37088659377199995,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.8567,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.32460476013111056,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8176,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.3165303118335729,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.7878,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.30565580680683524,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.778,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.30616328404606236,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.7669,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3372819522478746,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8253,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.3381157784774621,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.7837,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.37360600798509136,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8234,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.33012256207534557,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.736,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.34389624056846985,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8622,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.288060189131907,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.7693,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.31397585630745445,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8236,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3225054745417322,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.7671,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.32794264489492814,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8245,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.33655424519373633,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8104,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.33508411660895715,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8186,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.39563760749889826,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.7915,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3265114128373369,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.7707,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3157403257540551,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.7905,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3431574816357997,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.7795,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.31241247865104216,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7573,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.31433971480001943,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8022,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.3422687306764917,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.8084,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3333545008622585,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8171,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.31741446992978994,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7369,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3542194582461601,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7742,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.32353442263225596,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.7485,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3414106252962905,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8001,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3267608148988314,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.7764,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3367374059916079,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7331,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.3213805771349721,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8088,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.37163104509465206,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7812,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.32955187643987766,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.7802,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.29860414347032455,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7319,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.31375066497346393,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.7866,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3188775915788622,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.746,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3260432484403654,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.8036,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3155014669529756,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.7851,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.30508645394256523,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.7718,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.2923922402971162,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7763,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.29774460327571245,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.7716,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.37219652290163485,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.6968,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.327765713563282,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.768,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.30579480239653173,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7224,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.35549030323617814,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.7889,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.30604310949701513,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7344,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3199842246271541,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.7592,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3188264146047309,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.7187,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.30584368371132103,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.7595,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.34947093817612324,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7917,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3129331216116529,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.7874,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.32186276541072933,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.737,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.3206007968349657,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.6785,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3440231116205369,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7705,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3038874492251255,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.7337,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3130447119031358,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.769,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.34626456768156516,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7398,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.2930574912319025,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7161,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3077872285049125,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7603,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3276012866868929,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7585,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3576543436577549,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7851,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.33090749089696053,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7873,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3125775160031755,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7721,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.312867542143936,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7688,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.33590251445511526,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7539,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.2973307464223306,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7521,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.3152236348770068,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7154,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3113881739746379,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7717,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.31332601927708326,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7288,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3397860539559529,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7561,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.323198798835529,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7466,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.30623823493923746,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.727,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.37285660237595886,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7894,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3262848271857289,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7651,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3620365495484864,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.7745,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3057218497192761,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.7385,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.29327000158218175,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7058,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3054427550394877,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7432,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3408694429782927,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7762,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3392121321032056,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.7593,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3360555533470809,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.6859,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3031735858575293,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7343,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.3571389466995919,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7475,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.31507833917832595,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7129,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.349220769192541,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.7594,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2992587393857802,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.6826,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3354232604768008,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.8029,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.30436303254090713,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7381,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.29220299250608445,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7514,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3445834090727366,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.8141,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.31361830449707406,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7502,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3012805337810813,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7005,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.33491038256274097,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.7707,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.2928412264150717,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7496,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.2973820426225998,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7435,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.33198600910882076,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7374,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.30397289758816454,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.731,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3275161096163156,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7701,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.30942657238708327,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7617,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.33949532944889904,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7467,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3326053632110846,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7319,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.33610166939644226,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7343,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3341282015985498,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7447,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.30529532490111727,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7017,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.30194530957503607,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.6978,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.30784297599221105,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.7004,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.30632096068901793,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.6827,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.33150651532281467,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7397,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3053637242286552,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7533,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3442857177860226,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.755,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3012900869853622,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7232,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3432431419991961,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7128,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.29882942062533513,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7693,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.41375178194456913,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8417,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.308129486439542,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7332,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2807674738390264,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7218,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3078248328225141,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.6961,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.290270799196629,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7034,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.33081309037600143,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.6991,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.31433887184833464,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7441,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3286954369396406,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7037,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.31183061999915007,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7096,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.30613772263369227,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.75,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.29338217061262856,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7616,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.38267250553954885,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.7247,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.29705960921210556,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7251,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.30418653265151313,
+      "learning_rate": 0.0001,
+      "loss": 0.7422,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2766402451030531,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.6737,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.27765442045055555,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7201,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.33469021959761414,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.702,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3007870516067576,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.716,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3086653479924654,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7244,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.31949221536573363,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7437,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.31639220416180613,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7519,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.34356161115139805,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.697,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.29377139329903523,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7182,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.3061399495541095,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.698,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3350824147892866,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7043,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.3162601083552609,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.6629,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.29888480684890745,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7368,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.36898201481823084,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7991,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.32071719760353296,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7005,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.34119737881585444,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7253,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3097059116211661,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7352,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.2985655349964369,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.6863,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3099795875381943,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7238,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.2960209581200614,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.6965,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3447576819640971,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7548,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.29867157227468866,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.672,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.2956036416108182,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7172,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3177288874362381,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7173,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.30018945978996536,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.6953,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.30053953471137596,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7147,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.35338506088450466,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7443,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3080546389515865,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7107,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.33340506284983606,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7401,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.30467308149483496,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.6992,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.302039988507991,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7113,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.29675341284227075,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7337,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.28326891243017777,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7095,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.2841228427758416,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.712,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.2791429263057307,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.6982,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.33520026689446214,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7683,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.2902002710909738,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7006,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.293451022306754,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.6983,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.30046635314909986,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.6992,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3233258402376756,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7009,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.2980725039304043,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.6507,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.2954458399003553,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7199,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3570840084902058,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.674,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3101295016619717,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.703,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.30749681984973637,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7338,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.3291093600999409,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.746,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.2720958629997114,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.6911,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.30753918488924453,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.6842,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3216825680175012,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7591,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.29659000572707517,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7325,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3962913645918868,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7051,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3026672237063192,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7115,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.31177627875961095,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7362,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.27031323704065807,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.6999,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3080115048245897,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.6881,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.26336675222056166,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6917,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.2977353273084597,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.6918,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.2812550180221224,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7328,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.2906984244863733,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.6946,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.30052517518374744,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7149,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.30688289620855186,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7324,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3054582615261841,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.7339,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.2944328823100095,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7185,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3079056820382457,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.6519,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.30674447305985186,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7145,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3001577932422797,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.6646,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.27810720745161477,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7027,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.2670136114510044,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.6764,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.26833863371496064,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.6835,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3232697533624734,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.6975,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.28561088539631885,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7052,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.2811361506677406,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.6856,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.27587824227529845,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.653,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.2727589576041019,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.6707,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.2790043641300064,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.6442,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3114250313351382,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7008,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.28659185546993066,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.6823,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.333270423139455,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.7449,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3015137026076761,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.6864,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.2967636116610985,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.6717,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.29290983244483276,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.6942,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.33090797220012547,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.707,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.31240549536269197,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.6916,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.33318737481985666,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7472,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.2991242050339852,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.6865,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.2869364055447715,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.6853,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.28846836401603,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.6878,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3751675371753753,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.7167,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2946460631091525,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.6625,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3191164954661919,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7096,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.2795783534552219,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.69,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.29593212497721605,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.6903,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.31815489083177395,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6864,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.2966930688795833,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.6927,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.298021896337788,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6759,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3025964272635615,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6929,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3548545029837512,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7519,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.2849756804123775,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.652,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2737573510609918,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.6571,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.29796463169165993,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.6516,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3038486218868143,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7304,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3283761528592076,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7299,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3228101439752998,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7221,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.28615791551135733,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6857,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.2798346418152692,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.67,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.2803680727826389,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.6805,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.300954219778753,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7037,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.277934902739733,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.6681,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.31512482967960964,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7092,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.28599544129060733,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7065,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.27602944028769505,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.6853,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3267933089852728,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.6858,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.313459679429886,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.6839,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3001825550345055,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.6879,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.31637271766164904,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7064,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.31838626264793163,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.7455,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3074064029178275,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.6752,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.304961915963322,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.6734,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.27696421295588336,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.682,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.29539939286952366,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.6274,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.30824123193852787,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.6613,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.30298284813410736,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.7081,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.38531225841242817,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6766,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.33880034089518074,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7444,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.2866062477084964,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.6876,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.31917192539126965,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7148,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.33612170964556043,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7244,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.2795209986859653,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.6897,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.34555833343813835,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7329,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.273768301964365,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.6903,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3008109477758345,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.6472,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.272451739640845,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.6455,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.30995766008558534,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7038,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.31021019826393165,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7032,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.279226548216852,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.6872,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.2997198731268311,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.6625,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.2895908073868522,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.6554,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.26714971443515456,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.6454,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.33520987293435356,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7539,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.31608583930661005,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.7005,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3033775341477071,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.6894,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.32634303773898465,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7045,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.28748004629627927,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6712,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.31678645523713805,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.6722,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.31000107831499757,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7094,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.2781674506548415,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.6657,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.26616307698787445,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6561,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3176419427961581,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.6898,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.2831916446551315,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7033,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.256439033754437,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.656,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3055519184450505,
+      "learning_rate": 0.0,
+      "loss": 0.7088,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 802271864946688.0,
+      "train_loss": 0.7511108540571653,
+      "train_runtime": 9734.5246,
+      "train_samples_per_second": 1.027,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 802271864946688.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e273ae28008be5e0a49b27d582dd67ebc2779991
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "gate_proj",
+    "q_proj",
+    "k_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a4a9f958052539812c70a1b002544ce6ffd4f4f7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5c6b66af35ec9a9f7103ad1187b72969b06a01aac9ff7e94da2272f6a0f877b
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2dcef3fc36b8721ca0fb05c44e38f373b7fbb95b
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cf8cc0b1167f652ce9945a63057d9d7de52436c1f5aa0a53e58ca12f88d7d1e
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c6d99a977c1bd0ca7f89bd3f2b82c39adc2e8c5
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.0374368174554673,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.4765,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9647907368918112,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.5275,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 1.0687221015355828,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.4583,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.1073894659931591,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.5343,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.872939750982351,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.4482,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.7827606006770655,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.2609,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.8767643959762481,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.1594,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9400756856208291,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1574,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.8173025011927325,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.1197,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.024212238400255,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.9813,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.8810021393418611,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.0469,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8402997948170003,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.9713,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.8377451249568147,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 0.9829,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.7458242310989486,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9791,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.7818629204315781,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 1.0254,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.577254793980875,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9164,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.5196810929009288,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.8945,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5833759720354792,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8972,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.5250700596142646,
+      "learning_rate": 0.0001,
+      "loss": 0.8949,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5768087471670845,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9445,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.6046200078061305,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.8924,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5472188660850493,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8678,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5140068334871463,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.9087,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5778034013843734,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8774,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.5291394827107463,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.8625,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5334723147989654,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8784,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.6504839292539988,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.8903,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.4872484533928141,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8226,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.5109899744706795,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.8469,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5177464368126972,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8804,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.43544748073661604,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.8103,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.47195580204436993,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8169,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.5172764639850093,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.8867,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5374199599281689,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8723,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.47035134318751337,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.8921,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5020079958011039,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9284,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.4757829581513172,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8465,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.48388800543752086,
+      "learning_rate": 0.0002,
+      "loss": 0.8578,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.49506459150484233,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.8702,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5956315840315756,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8123,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.467793612449344,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.8314,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5048076691281702,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8696,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.40294387287403305,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.798,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.46854643898417353,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8103,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.49336650140129507,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.8102,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.47200531080017466,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8095,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.5224134927367048,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.9128,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.500553057639183,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8842,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.5027943918318422,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.8701,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.45907904496011853,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.7932,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.5887996842150158,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.8548,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4401221656214865,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.7761,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.45742857334650894,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8287,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.4594651521945782,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8803,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.4751273716695581,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.8337,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5194707687606167,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8136,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.46889516421407623,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.8388,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.49447907478543895,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8274,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.4982392434002533,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.7147,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.49653869839949516,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.813,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.46503031036415227,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.8348,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.4553314478017029,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8459,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.5099452510535164,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.8437,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4450900508201653,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7594,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.5139840949808181,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.8131,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4537897303649624,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8247,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.4793891004260735,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.7717,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4601117089424223,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8088,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.5067495430657267,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.8491,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.42039937044715653,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7833,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.50716358892272,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.8387,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4437233808887912,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.7922,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.49349683185308396,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.7851,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4989796856683146,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7571,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5592873172818614,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.8056,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.48178379099074353,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8195,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.46034118570648497,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.7944,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4792276771943578,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7781,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.4905112294899911,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.8404,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.511027253538957,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8649,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.4502231765028985,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.8047,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.4729558428131548,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.7284,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.551203178138253,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.9007,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4955467240801091,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8179,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.4973043930768317,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.7517,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5749059417258057,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.917,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.5071055341489035,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.8924,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.46257737471037086,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.7609,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.44168783280802665,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.814,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5454741570250863,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8656,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.45261852211312303,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.8285,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.49841285496121795,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8088,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.48386523753794336,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.8061,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.48053831208676606,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7484,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.4771364223196281,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.811,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4970171280528449,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7632,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.4687893128093611,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.8255,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.45499599756252357,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8242,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.45447226273451646,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.8082,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.47297411650227233,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8291,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.40269649053074463,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.7669,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.5113495244968219,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8032,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.4327241663791887,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.7878,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5038311447644328,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8663,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.4849232659450358,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.801,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.4681459892369315,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8459,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.4360285161996867,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.7529,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5390379077321039,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8735,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.48479802014409695,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.8477,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4057026106248144,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7647,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.44929090068815086,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.8172,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5354767704232606,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7891,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.48156568074510114,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.7942,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4928990146899524,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.7533,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.5409413287368926,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.7595,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5430691232147374,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7757,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.45766800539177555,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.7872,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.48224015934935577,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7849,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.5128445310359344,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.8622,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4791435694937396,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8156,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.48277627373747767,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.841,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.435256417777546,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.833,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.423589901414426,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.7381,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5259314780181126,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7611,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5738789466383943,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.84,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.4461538377556106,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7562,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 1.1937291424727432,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.8755,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.6228342628411248,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.884,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.6214472402814468,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.8182,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4557486063355505,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.7706,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.4812066863375617,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.8316,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.447456871434426,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7411,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.39977666970703024,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.7433,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.43667315089753594,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7684,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.4494756970444253,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.7688,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.49026027382847054,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.9187,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.47684851425475017,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.8942,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.45422995643590125,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.83,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.4020057030148452,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.7248,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4503058589261113,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7878,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.7534723512494698,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.8465,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.5399174271022059,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7768,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.48574380251997074,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.7802,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5910788511465598,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.799,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.42498860003951533,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.7758,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.4348223664639345,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7638,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.4422740800075286,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.7493,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4358249700796869,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7581,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.5218921014346616,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.7822,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5028643589840232,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7344,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.49755928745931766,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.8326,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.568496585050559,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8215,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.429493869454417,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.8248,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.9825550664511603,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8457,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.5113421864776261,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.8057,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5357210696781818,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8161,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.5257837365454168,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.8191,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.6213616122058353,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7425,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.444592236051405,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.7491,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.49639411717399873,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8507,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.39605949311819466,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.7163,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.47945044600591147,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.773,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.4711373930990777,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.7228,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4691074164513893,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7575,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.6643970248014088,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.8381,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.5492820446210721,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8513,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.45992542125359726,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.8132,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.48259927780873124,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7405,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.4157849474201508,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.7764,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4630117970569246,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8203,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.4767300639859073,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.7438,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5643052333813844,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8728,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.5075287841776309,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.8214,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.4520406164867501,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.7665,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4210038006148116,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.7426,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.6248446036294675,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.845,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.4643824586786272,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.7389,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4526319139574308,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7876,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.42928338585788617,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.7615,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.55960583171419,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8021,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.40208685338228656,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.7784,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4726464051961062,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7474,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.4847755436931592,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.7826,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4591137881167212,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8059,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.5062833652786924,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.8268,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.49955099266247305,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8585,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.4999065709478267,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.7069,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.48090141281052196,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7943,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.5120636266933695,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.8641,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.46396040035960373,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7538,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.48099853739089377,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.8104,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.45113176495385104,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7747,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.472301307522119,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.7552,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.4057783708879821,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7048,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.467258387306045,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.7941,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.516487481035125,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7853,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.42347807802259363,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.7649,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.8367495246429115,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7083,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.4525969776176922,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.7624,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4982254354963437,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8169,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.4897982009512447,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.7375,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4721918519110925,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8343,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.4802560936686167,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.7365,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.532981189533498,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8503,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.42372376234282055,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.7674,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.45321557602235424,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.8055,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.4312975492954434,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.7738,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.43024914345350995,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8008,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.46450573652223054,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.8204,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4802445996979556,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7825,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.5280220525726201,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.8355,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4655367627849839,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7514,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.4159172606320722,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.7176,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4548273843508264,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8194,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.4942757859153581,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.7833,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.504896542459169,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7522,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.4447487563249888,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.7616,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.49632307924477864,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7939,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.40869032768866065,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.8287,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4332429743929596,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7404,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.43036598063291176,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.7933,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.41630272039412397,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7512,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.5123312220359458,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.8504,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5116812124880064,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7846,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.42195692450016364,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.7404,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.3700439187134879,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.6876,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.3834528350465055,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.7907,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4616055414126435,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8315,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.4983638436350939,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.7365,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.44216589035323284,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7756,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.4742933096255244,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.8325,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.49045810829845043,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.749,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.5184488152849216,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.7795,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.500204812288988,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7897,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.4738561000941789,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.8309,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3916407430344318,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7133,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.4518339075048928,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.808,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.3856933180025405,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7658,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.4940554727867292,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.7636,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.46136071241676646,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7221,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.4321513860869631,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.7615,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4206839910688456,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7459,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.43391516279938624,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.7941,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.40858977353454384,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.769,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.4349779920862305,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.6979,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.45598470923578976,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7796,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.4665716149736225,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.7878,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4463411278050276,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8049,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.5077164997106266,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.7728,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.498872148590993,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7527,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.4826197747486357,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.7856,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.42919062355595305,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8277,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.41782538449347645,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.6717,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.42784641824086067,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7196,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.4512419784378731,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.7424,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4154593646623652,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7489,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.4384153798043128,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.7495,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.408933173821226,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7471,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.4466848371228825,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.7023,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5676238878157918,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8444,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.49940950060271694,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.8093,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.411486098422595,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7161,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.37889999469972857,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.7245,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4288283798698304,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7069,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.44383544195252433,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.7502,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.48086052555298,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7936,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.3957322973294756,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.7021,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4400708983479737,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7493,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.5131851425434888,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.7989,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4445777972230032,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.8149,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.4081132952903988,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.7559,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.40312914828746943,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7255,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.4993878111709934,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.8224,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.4383548287016405,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7286,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.44567548374122995,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.7281,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4292248672579036,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7653,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.4955153306282761,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.8314,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.5094574028346628,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8197,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.4502767303537129,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.771,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4316637178021228,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.728,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.5303243108120476,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.7863,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.3893145252905154,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7195,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.4790203255564661,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.7858,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.44061721980349183,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7219,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.49032305188970193,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.7501,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.46994437658774807,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7514,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.43599573839129335,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.7232,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4968621506667461,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7856,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.5334967293115999,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.8355,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5049633275191162,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7993,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.5226946501993106,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.7614,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.46031635923303504,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.8235,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.47470104995479906,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.7683,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.420096335266906,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7614,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.5307529328489271,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.7344,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5531361713545803,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7952,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.4379379434223826,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.7881,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.47042976585887974,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7343,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.3992671800588768,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.6851,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4177986577565525,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7668,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.4354010399914639,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.7041,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4462884492349542,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7676,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.4191032529227301,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.7693,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.45465716446320836,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7267,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.4457794134577356,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.7928,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.5296231776916995,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8466,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.38882200427077135,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.6935,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.43860765207791874,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.8161,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.42531906914129314,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.7676,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4091747089690747,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7865,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.42929783579785047,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.7917,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4278215467335946,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7195,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.3666058590710916,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.7403,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.476708669261199,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7797,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.451915357445945,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.8248,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4025080301017516,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7972,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.3706391455912095,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.6941,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4217817302490733,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.737,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.435246810546811,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.7746,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.42681516551875237,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7491,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.48731269462670573,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.7313,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4280493549246099,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7517,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.4106403555591145,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.6978,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.47468879663103564,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8465,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.45988680617556915,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.819,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.42180705245581873,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7365,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.4376924090800282,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.6623,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.48363351015083733,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7363,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.3942595478028239,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.7566,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3933448029539557,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.6969,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.434616179162153,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.7609,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.39288081687478416,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.728,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.46607832581942166,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.8335,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.43282334279022616,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7501,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.5417948957602462,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.7462,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.467210018753779,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7991,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.4543489707663741,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.733,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.4932319707043495,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7418,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.4537239614953879,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.7782,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5698675499674541,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7718,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.4011892818050699,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.7275,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.43859794585929307,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7169,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.4202743554700086,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.7884,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.45863226891592085,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7542,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.42463800114594674,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.777,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.46805220582677076,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.655,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.4856095496588775,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.8294,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.3937245968763575,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7316,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.49058614828004504,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.7843,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.47806594064348984,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7152,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.4731835025299986,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.7094,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.6731480218012125,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7495,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.4297964437043898,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.7177,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4227575564035185,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7613,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.48544793119103596,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.7532,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5077766955230878,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7322,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.4809980875485111,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.749,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.45732029859816226,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7143,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.4662300851432532,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.7359,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3967132540424373,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7129,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.44430664357222777,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.7227,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.44023955662980985,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7675,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.4169092049722635,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.7546,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5277552917044552,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8091,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 1.2019996023887272,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.7482,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.903668879284044,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7268,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.44840199228934713,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.782,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.5013335808521202,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7607,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.4213974027032967,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.7424,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.41993773983541244,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.771,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.40557131036398736,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.7058,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4523507100232458,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7137,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.4731710302193421,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.7295,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.42432724880364664,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7482,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.4831867061115927,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.7652,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.41640311207575037,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.6579,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.4586313986798324,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.8476,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.41162605214953046,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7042,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.4038408611576945,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.7604,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3915146244892308,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7389,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.4308339515168911,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.7486,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.37800420601980367,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.6813,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.39802369432443996,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7324,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.38100120954656946,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.6791,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.4336361465865306,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7019,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.46580240140204016,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8084,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.44591142184346877,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.8015,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4126781708157891,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7149,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.40599820091049194,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.7162,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.47857559032252306,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7205,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.38736860683906243,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.7345,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3911052800141332,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.6961,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.4099751708672859,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.7876,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.4789589912306157,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7504,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.5146265447705376,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.845,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.634963848674221,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.788,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.42427227666675715,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.7113,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.4442940256282359,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7814,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.4426652054664872,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.7697,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4317837128486373,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.6493,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.4482570698551614,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.7345,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.44129565114368685,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.692,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.4361340943441822,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.7153,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.48161014364941834,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7397,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.40021146044944383,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.7628,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.4391173187278975,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7228,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.39810258693029316,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.7685,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4328481242077568,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7482,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.4028096379053827,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.6994,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.46181503625336007,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.808,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.5113755956738689,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.7563,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.44512900415147594,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7207,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.4611652590698016,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.7583,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4591918606931989,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.6874,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.45267916132053,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.7577,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.48815860688357016,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7242,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.4539180685668656,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.7508,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5388177877102638,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7545,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.3950822482754754,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.7153,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4450384687439473,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7622,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.4177962669164234,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.7639,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.40009315582507743,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7264,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.5054917593028728,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.7599,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.548084106630373,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7236,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.44702097951848013,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.7897,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4562795227267379,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7376,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.43127424064408904,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.7718,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.45822416474456973,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7834,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.39743796981470814,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.6835,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4899454842148273,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.8564,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.42129591015531603,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.715,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3975991085405616,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7243,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.42462308414974403,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.8114,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.47903098108233355,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7138,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.4577855193694479,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.8058,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.43580528832715576,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.783,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.48304346147280797,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.7576,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.40679595500463156,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.659,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.4437871417383588,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.743,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.48954633510827106,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.8154,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.44973543847141406,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.7579,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.5524658691503546,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8149,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.4396765832399089,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.6877,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4406786552471726,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.715,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.41322964282223196,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.7443,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4604077012727492,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7457,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.43228587119062767,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.7069,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3866639819270244,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.6804,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.4072491204192593,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.7182,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4158115899003882,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.6316,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.41856795470122343,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.6877,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.44032268508903893,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7754,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.3793269364037703,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.6997,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4073378870722764,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.6769,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.42363471968666755,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.7889,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4520135230580361,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7619,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.4280710487309365,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.7166,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.40250832106697515,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7183,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.42287049275064076,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.7637,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3961706966946667,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7116,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.47852621016581787,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.789,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.42921164444493115,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7491,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.4231796426849485,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7023,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5219949806476437,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.8286,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.4417614083672534,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.6982,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.398529604058241,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7249,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.4063119021543565,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.7563,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4150229671695859,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.6996,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.41294976755644697,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.7335,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.46171998521483076,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7272,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.43386019673425774,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.7348,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4748047451711399,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.762,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.4530528969151596,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.705,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.7154024051831739,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7291,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.38238003488071715,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.6948,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.42954402140844056,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7365,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.43920465231655637,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.6759,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.50806928834242,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7163,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.49100412658177245,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.7757,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4265044923408589,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.6973,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.4485524583028315,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.6689,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.43226811757877065,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7459,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.416201547102863,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.7209,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.39664734173834565,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.6513,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.5873619252994348,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.6968,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.40793179787572403,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7358,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.4228884403871606,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.7642,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.41060499981163073,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7101,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.435862591273703,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.6965,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4869556356346243,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7007,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.36488581075679316,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.6422,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.43592851398129917,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7195,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.4296963476221147,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.7473,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.37645455777538245,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.6946,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.3967480623621568,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.7451,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.39790923257593114,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7127,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.44216137091031993,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.7593,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.39409587111420574,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.6994,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.37628399571003923,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.6659,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.40137145922137873,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7574,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.40575702287687193,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.7531,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.40095939343185877,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7255,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.5328369689927243,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.7035,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.41558442501880183,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.5965,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.45450727782192485,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.7154,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4429452206189218,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7424,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.5286021883772923,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.6966,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4210548211045082,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7629,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.4328737270139645,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.7113,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.47783936685408646,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7542,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.4276078646841453,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.7391,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.43223865506668013,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7755,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.4071862625631177,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7043,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.5165019174887554,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6902,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.3892717255463915,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.7137,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.42836054777343024,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7395,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.4173728809888404,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.742,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.3535504159455629,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.6703,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.3676936927081411,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.6899,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.42111910180024936,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7787,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.4116016278981681,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.7291,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4031029250802671,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7056,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.3835513414134938,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.7112,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.46509263748679647,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7185,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.5746143007898875,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.7493,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.40227934573081,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7322,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.4230454693600124,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.7309,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.40336316080918005,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.6341,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.4351510587979726,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.7105,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.39495593892561437,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7168,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.3967426580793475,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.7194,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4147208134179952,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7238,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.36967969461923617,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.6632,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.4287011586351211,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7401,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.399649606969939,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.7814,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3733433687843168,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6759,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.4567797291797252,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.7411,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.39111249426608224,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6682,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.399184072015993,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.7405,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3623284143286997,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7323,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.4369971125336579,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.7521,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.3939866311585749,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7191,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.44007330550179097,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.7526,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4623133533657372,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7398,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.4356767891279233,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.6771,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.44781292119353033,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6905,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.4350358948741196,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.7757,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4263688147824068,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7146,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.3918373554164718,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.6814,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.42554161469132323,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7097,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.4510043717815143,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.7397,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.40548676560106917,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7188,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.3930449013275765,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.6938,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.37686291795705645,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6823,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.34403095650051896,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.7023,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4224804903041466,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7154,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.464741312247205,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.7756,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.41450530264771623,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7133,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.42338494134964905,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.7451,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.39664522174179934,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.6868,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.4214350598528371,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.7087,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.4460593375778333,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.698,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.41866076775674965,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.7361,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.38057473365898153,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.751,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.38737097939391985,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.7236,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.5171385787171731,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7091,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.39932206928896563,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.7045,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.45276083810766304,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.665,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.4546548462423871,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.7201,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4317938044917659,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7137,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.45750459611787675,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.7061,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4054347668668675,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6743,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.3820550665195405,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.7234,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.5419350535886606,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.8053,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.42611264604135285,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.7728,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3519728305830471,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6742,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.4089474340694498,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.6596,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.41015990254032864,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6952,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.38680365195050437,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.7113,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4212260550526898,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7106,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.48743070631599167,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.7265,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.36740607737976266,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.719,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.43253030252457,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.7987,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.430381763981451,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7449,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.39252269540792745,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.6505,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.43452553027659485,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7067,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.4112649946570928,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.669,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.43254489628109855,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.7559,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.3854223725240552,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.7326,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.39943296691131486,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7384,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.3999457589988329,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.7388,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.35882038109855535,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.6333,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.4140687920652767,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.7279,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3740753880813183,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7056,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.46208895798030497,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.7335,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.46377081662772324,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.716,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.38305067732640785,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.6512,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.350180861720446,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6376,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.4480134312250272,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.771,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3995371197659842,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.657,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.3841862716369333,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.6695,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.33530177000934397,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6374,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.37125518410251107,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.6261,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4267792415551032,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6829,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.42046426161249545,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.7089,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.461124919714734,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7683,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.3835175921110268,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.7167,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4566024997195662,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.787,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.40318473698590684,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.6743,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.4285776183346902,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7106,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.43612420650390027,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.6738,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3378784964593356,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6831,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.4049813905815682,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.7191,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.37071274223937906,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6722,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.4123650756025684,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.6982,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4007101740530657,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6597,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.40314128961867496,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.7548,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3827117028203895,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.6954,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.3743203935131224,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.7035,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4068759593814105,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7002,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.4125840511413991,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.7444,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.4430040213051985,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7316,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.3972766518463086,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.732,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3710596730970979,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7097,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.4166041804263977,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.7319,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.42407249404557373,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7102,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.40428242410174164,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.6893,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4090237750909439,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.6569,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.41322585430378184,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.6852,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.4034583736535131,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7315,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.38020879026638044,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.6595,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4534286548708508,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.6826,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.47032200968307747,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.672,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.49084443662279653,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7733,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.44710860334300656,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.7091,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4431207850279007,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7457,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.42285002870095373,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.6834,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.45191770643994017,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6795,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.4195601822442567,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.6417,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.39460985762925294,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7023,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.44789571155478425,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.7759,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4300703454189236,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7448,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.40179064279153015,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.7139,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4251526854618008,
+      "learning_rate": 0.0001,
+      "loss": 0.624,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.39057488078696945,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.6603,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.42630707149847313,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6358,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.5327749683651356,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.6579,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.40059686695101615,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6655,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.4818593418381336,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.6803,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3831421544104826,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6959,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.4385493137510518,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.6315,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.4187935330881887,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.7037,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.417803657805014,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.7078,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.4762132158687772,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7378,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.4119388003608324,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.7254,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.41486154550669097,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6415,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.3712347933088719,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.6451,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.40303060884995057,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.6969,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.4619075618967914,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.7211,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3546479520494259,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6806,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.4383990518530939,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.6902,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.4662434092306648,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7011,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.4028329669021629,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.6728,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.42368747763122155,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6635,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.4726867637244179,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.7119,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.4215263629263457,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7225,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.43626293441672603,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.7973,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.39943765901908274,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7022,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.4112569173604557,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.6989,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3858144331806179,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6243,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.6098326779465145,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.7532,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.37361671169949173,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.6195,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.4370848116862876,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.7093,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.39664569567571945,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7014,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.40670887161198743,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.693,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.40726690010248706,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6217,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.44412816216991685,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.6866,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.426728437560957,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7291,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.4164401946897966,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.7369,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4321081763133884,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6376,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.4731745394937619,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.7111,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.39995757145877464,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.694,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.4364384119760376,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.6365,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4114799763475482,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7632,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.42800537132030453,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.7324,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.43782200893842077,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.704,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.4301697913751564,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.6989,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3923326776816395,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7393,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.4355980113522696,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.6924,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4404265439104104,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6794,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.41387428034521984,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.7322,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.390878116934256,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6451,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.4365440266919981,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.6906,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.42202109340957455,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6962,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.4681452856192567,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.7254,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4647895188575558,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7099,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.47224116271446176,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.6874,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.3767664595638354,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6404,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.4312374747931622,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.717,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4754052770334311,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7162,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.5014028514766248,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.6555,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3734741582608827,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6584,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.44542244524799046,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.7316,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4206856768160065,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6753,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.42678188555779784,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.6847,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.3635002403845702,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.627,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.4065817189128296,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.6938,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4316955926145281,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.701,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.34775313117517553,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.6375,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.4347971567231205,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7339,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.3870375456359733,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.7262,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4507678296290438,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6907,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.42712214159726214,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.7668,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.41015505420405474,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6888,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.39763102750325163,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.6419,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4191530147392954,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7052,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.46946824442040413,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.5974,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.3728556157947248,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6967,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.4043781316642281,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.6936,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.42578695870834243,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7122,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.3696512046075325,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.6606,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.4591620718922479,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.7154,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.43031506983236373,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.7474,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.37611435319488984,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6613,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.4398367622705673,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.6985,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.4606668684235599,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7546,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.4454089848272011,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.7565,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3796944207616218,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6453,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.4870938883020908,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.6231,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3919047888701881,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6756,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.351449737980625,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.6984,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.42252464066428147,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6686,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.3914560028501668,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.6908,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.40295174723623234,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6445,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.39073172302904613,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.6937,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3709963125321195,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.668,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.3933866238712117,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.6484,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.4063031184543873,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6566,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.40379378551884454,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.6959,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.46800810242234814,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7401,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.41783370423759914,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.6845,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.3445165919380514,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6381,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.34281709881514283,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.6033,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4911202166509181,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7597,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.4453953009786171,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.6656,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.41829674492066976,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7278,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.4627807550930201,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.7188,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.46143136545483737,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.678,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.42262247738767705,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.6266,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.36706273204871864,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6833,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.36955889205145553,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.6326,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4051583767582194,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.641,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.40047164298288446,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.6597,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.39501334821182466,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6666,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.382263551767098,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.6689,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.44546686537191926,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6527,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.47193164450770997,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.6341,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.46796251950243534,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7434,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.41135014400587067,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.6632,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.35174694214103835,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.5599,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.4261893917296931,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.7009,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.4113412739863776,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6665,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.40679542718728934,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.6701,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3926481491746906,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6119,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.39778211443742756,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.6747,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.44917308211135637,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.747,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.4236705214830975,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.6911,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4014067169197921,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6786,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.37858996457960126,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.7139,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.3926606726180132,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6702,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.4258896736184436,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.7064,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.4023392887251313,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.668,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.5210062335588069,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.7616,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.40328490054705385,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6838,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.3889947109465521,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.6857,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.378072964715034,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6649,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.3841233525812373,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.6999,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.5040506704424114,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6357,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.4034614415798281,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.7319,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4226046627171998,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7206,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.3863173873353244,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.6538,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4509624943489986,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6813,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.4325182683212749,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.6811,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.4304866664044165,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6954,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.40887192147637236,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.6923,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.47669886743863876,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7068,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.3654067929691481,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.6683,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.383565537193584,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6437,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.3893638313547271,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.6859,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.3927667218699901,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6595,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.38977229894335297,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.6497,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4012048981102772,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6744,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.4205859614124761,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.7039,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.4238390039930889,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.5847,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.5431135837848629,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.7442,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3841051928298376,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6183,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.3697223969819957,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.6375,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.3590098236656919,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6404,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.4125776500113079,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.6878,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3947146565222585,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7141,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.3694402423492765,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.7004,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.3835766426472468,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6935,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.37080887534840773,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.5795,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.43723240190470886,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6628,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.39385458611995466,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.7022,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.3901449855591286,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.647,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.42770788167662127,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.7048,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4896025780748295,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6736,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.3939777926516438,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.722,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.41252256589683123,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6749,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.4230612348858875,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.6704,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4217687932057089,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6463,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.36426708106640004,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.6484,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3953184886570678,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6941,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.38223814139919626,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.6297,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.416064650999233,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6965,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.4004307146179374,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.6475,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.4708079297383675,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6851,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.4290088114427895,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.6676,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.39728379907446343,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7086,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.3539627879192858,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.6368,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4516247553177675,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7271,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.3719122344570917,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.6593,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4201357728075419,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7337,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.3856777395195659,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.6698,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4191793263602028,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6846,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.5023677642761707,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.7193,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.40152491674816604,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6647,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.5021325234550964,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.7424,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.3663233920022643,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6455,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.3766987285711635,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.6492,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.39615195638083806,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6335,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.3727499688053087,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.6462,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.4314468517020133,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6355,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.39132260159381754,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.6415,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.41696845922044734,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6324,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.43720442904996637,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.683,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4182924705742696,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6227,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.35803751020012203,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.6363,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.35708142177875307,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.646,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.4616648828250431,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.6798,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4188551672236205,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6459,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.46822151383276933,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.6995,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.44004281585292426,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6202,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.3573156653645513,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.5897,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.40237252162483383,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6221,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.3975645475240448,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.6496,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.38499345755069325,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.682,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.5140050698466838,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.6829,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3725159875858386,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6639,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.40950955909005243,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.6989,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.46031007485696074,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6312,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.3963141432384529,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.6873,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.43177629728606337,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6716,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.36534440801635726,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.7054,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.36222210330217647,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6385,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.39893542029413387,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.7219,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.4118250270518911,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7154,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.3568852499449988,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.6567,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.43549006346138774,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7329,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.35674849903182165,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.6475,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.38232166331613066,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6737,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.3909706882327398,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.6181,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4524146819591539,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7207,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.48696787150203236,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6191,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.4794679869123501,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6811,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.35425548179888433,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.6335,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.4067978349950799,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6612,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.40303274236473946,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.6574,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.4237551677595413,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.5851,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.42875369891186,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.676,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.42706425272560744,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7297,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.41576864434001926,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.6471,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.418117804387114,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6621,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.4150020946129471,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.6467,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3928687191971617,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6527,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.4160100982513934,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.6808,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.4794207042809734,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6599,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.46730184137061487,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.6519,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.37505905651722327,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6537,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.4232559113614526,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.6731,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.40992758465781476,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6794,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.4058249457233674,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.6786,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.4035278599984284,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6481,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.39784788196463317,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.5858,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4318721099836124,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6577,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.3706105798394763,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.6809,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.49204290897437447,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6702,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.5342172203437557,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.6571,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.40420590231024983,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6931,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.39025351336075903,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.6894,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.42860663505358243,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.66,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.40188415325475024,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.6811,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.5099159876951098,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6812,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.40195803826116805,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.6455,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3770629912663238,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.5853,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.3849565435590295,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.6395,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3636221080946348,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.5883,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.4324813351929036,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.627,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.36972044268012877,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6807,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.4246459396787344,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.611,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.4095368430242777,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.669,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.450062496563086,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.6818,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.45122798759617516,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.661,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.4172593406435854,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.6722,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.41231268375543334,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6579,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.36609118253065914,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.6195,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3944213557612597,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6892,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.5130378278043198,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.6606,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.41001351868405705,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6359,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.36033257918341494,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.6272,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.43236393064685835,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6306,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.4308461646312396,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.7005,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.4317565433068937,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6891,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.4522227837448108,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.6608,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.44695081016231847,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.7083,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.3542022042021266,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.5939,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.3683736484173129,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7423,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.39741628676824414,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.6767,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4034667413845989,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7175,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.38925888471791487,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.6653,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4160330119149825,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6609,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.40576759925288564,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.6533,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4035520416846824,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6722,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.4009366996579774,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.6656,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3973017016693891,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7151,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.4016695475436383,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.645,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.375683111180452,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6926,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.3737568291563691,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.6383,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4321402724189271,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6269,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.654352094348001,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.6318,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3697878748099031,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6839,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.4100180548031645,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.5577,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.423162638686225,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6166,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.3626995861072231,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.5919,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.487656427977271,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6516,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.4626190649805298,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.6741,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.4507445934656397,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6436,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.44973384212634937,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.6943,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3542486429343615,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6172,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.483158545603061,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.7324,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.4443601420717475,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6322,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.39815518075615547,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.6544,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.43538466982005025,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6692,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.3987085128336364,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.6337,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4867568989774928,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6384,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.435120991086208,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.6758,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.44316051394587086,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6631,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.45090996634509384,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.6656,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3738596205343436,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6349,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.43872253842178166,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.6305,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3890988139091703,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6524,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.43589107655422343,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.6641,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.48641179767435216,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6973,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.4701137681040988,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.6249,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.41946748101231746,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6626,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.4308555349400613,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.6501,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.3868020833047252,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6524,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.4180293930043457,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6858,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3989146936164297,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6585,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.409696477744784,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.6544,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.39017189335418073,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6374,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.40924851230332643,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.6235,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.41369897553900026,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6297,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.35413877056139925,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.7408,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.4200042956105686,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.7043,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.37583423368129587,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.575,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.44458425127920537,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6365,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.36979499734209953,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.6411,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.36321911943415197,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.5964,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.4190613289113207,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.6236,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.41415344820995503,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6581,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.4235017481418864,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.6457,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.5429098905663646,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6682,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.37140274130759376,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.6323,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3995966583247825,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6536,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.3845667540435273,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.6479,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.503563493811046,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7002,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.48271921958958586,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.6357,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4063399200880828,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6684,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.4059102196974217,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.6512,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.37060607282625513,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6346,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.3571634773263615,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.6243,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4632222917295501,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6473,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.38257054846317295,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.677,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.4856941237215944,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6743,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.38315673385655835,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.5963,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3710499452465796,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.616,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.40594033576649546,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.6515,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.39039736546857506,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6275,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.3598234016005934,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.5968,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3397120750449489,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6329,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.3940077109393743,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.6749,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3887730184910833,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6352,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.4228058902082928,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.6594,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3505931888428044,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6448,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.46031606701165484,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.7063,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.4807742352290222,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6369,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.4122791741476168,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.6459,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3689588926242668,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6087,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.3883132360963373,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.636,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.42706209168848325,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6331,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.3781175481002125,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.6368,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5456382242501266,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6248,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.4037859656966116,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.6351,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.6203428892266306,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.8087,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.42159889631433856,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.6825,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4586925573580974,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6819,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.3816846708998703,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.6365,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.41334639135666074,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7254,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.4045931207610921,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.6933,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.7938660736245234,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6161,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.3798388450210105,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.6537,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.398486259721519,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6353,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.3924844708387776,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.6127,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3704378021683162,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.641,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.43102170150852875,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.6408,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3974103484818911,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6454,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.4046789441586394,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.6832,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5029288780877051,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6854,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.4306683847026681,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.6454,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.36919468007704176,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6881,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.4696102188764594,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.6854,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.43331871809502737,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.7095,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.415549264948765,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.6591,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.3259761160426885,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.5486,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.4730303904496809,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.7071,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.39164646897654865,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6343,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.401940831835664,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.6552,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.4131804273551023,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.7025,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.36578940848087904,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.6048,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.43494772273468696,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6574,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.3413079186978449,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.5834,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.3702228926158319,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6724,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.35894623325969405,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.6094,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3832707585161335,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6574,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.4848049922814122,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.6712,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.4371107376549497,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6892,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.4247444084248974,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.6255,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.41743798972209106,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6532,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.43356762665383475,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.6294,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3745596848118526,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.609,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.5161669674957426,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.6717,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.40106947707116897,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6496,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.37874436034022985,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.6408,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.40967781910183637,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6272,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.4502051746466981,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.6752,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.41409654078712216,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6559,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.4024007134875258,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.6149,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.42202386062247166,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6933,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.34140954461432244,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.6667,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.35743381773303684,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.621,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.38721575694378707,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.603,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.4745584619884366,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6752,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.39580118702239475,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.6712,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.41660875623574883,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6059,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.5109707410202864,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.7445,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.4272723584555947,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6213,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.39866401576252947,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.6749,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4140133323335469,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6834,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.4310592151251458,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.6687,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.39700840989987757,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6731,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.41815502073714694,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.6177,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.46605250253452,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.5856,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.43347330875978707,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.6567,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.4088890928128398,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.7008,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.3982232717283411,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.6431,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3379945503748024,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.5687,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.6047215426337988,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.6368,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.38961219633009386,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6473,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.45536044577956303,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.6487,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.39258615219014364,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6526,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.42327691151854674,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.6143,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.4330882506423897,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6514,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.42339403515666846,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.6148,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3982875661057225,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6447,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.4259757936497943,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.611,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.37383392180131153,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6141,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.4245333502093859,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.6965,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5487012845068449,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6283,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.5532168865592939,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.6066,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.39103516028749713,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6187,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.44356388628846,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.6503,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.48139483872715877,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.5831,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.38181658336917085,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.658,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.4386022855137955,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6995,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.3901008769152281,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.6704,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4572589949870747,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6376,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.4531364663828187,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.6841,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.6513861861854793,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6916,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.4595477594709598,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.6351,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3872826193208302,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.66,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.3634488772721573,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.6141,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.4031911965288131,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6401,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.4016344899493216,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.6671,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4475939413873218,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6833,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.37003724677834593,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.5828,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.42351050004449026,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6609,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.41176807685505945,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.6423,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.44785618534091903,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.7489,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.36977653560254525,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.5788,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.37308227584401626,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6358,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.3550084968253862,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.6017,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.41228005753397967,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6555,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.4114956714467532,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.6754,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.3619490877691349,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.5776,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.5093977449239254,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.6357,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.41458795501113116,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6342,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.44181071205391537,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.7112,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.44258110359726704,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6001,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.4262379828710342,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.6599,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3615813743871014,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6202,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.4118062024955257,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.641,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3724427948084534,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6596,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.4357527971934326,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.658,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.38569246848606425,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.5852,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.42228534505779775,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.6388,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.36227443102893414,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6094,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.4331382548215056,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.6084,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.7345493278756747,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6157,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.38239616006000005,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.6536,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.417734215760906,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6337,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.38428949678475666,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.6168,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.4017311026883857,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6786,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.39124216968664244,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.6254,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.44134178764982895,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7357,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.44013606406112815,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.6511,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4143382812512047,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6132,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.37524037136107635,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.6601,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3676822009982106,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.5757,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.4578191014129421,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.6736,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4277201145139041,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6259,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.3953872212086165,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.6443,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.35997835055017297,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6642,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.4444962679689668,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.6286,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4615731175022121,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6951,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.4298474182913664,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.697,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4581268475559863,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6712,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.39428803622226477,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.6479,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4222497647461776,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6695,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.36614219046640323,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.6076,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.41210763713885834,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6477,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.4537911864094117,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.6246,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.39606692867099963,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6843,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.39611050158265976,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.6274,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.43727607037447086,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6375,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.4101542675685551,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.5848,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4193003486528778,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6676,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.4134121549238326,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.7118,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4212007528436988,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6537,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.42379361189297626,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.6256,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5180894594898717,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7122,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.47307211912432073,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.6683,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.43413757223825766,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6657,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.3895483494241512,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.6266,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3829929019680482,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.6368,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.36524539885154184,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.6319,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.40304824129876693,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.7264,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.3875386822649642,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.6186,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.43477554408887387,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6688,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.401224756273508,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.6272,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.40867393736243735,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.606,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.4026384489112317,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.6251,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.38630138834839706,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6321,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.4304460037135496,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.5856,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.437848733905509,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6794,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.4005311104954899,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.6497,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4199134531108724,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6099,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.3979361042617356,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.6798,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.386354953580877,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.5612,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.404254564584727,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.6745,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.42490876854613235,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.5576,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.47076547735719737,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.6711,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.38845126740707286,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6826,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.4269692732958586,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.6626,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3392421835390835,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.5744,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.42102143136980824,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.6317,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3860964066759228,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6548,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.43600753519657165,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6195,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.42991034950979723,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6245,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.5136795950746355,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.6429,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.3936575876555075,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6021,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.42876218271277017,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.6478,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3946883037955344,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6055,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.35540431096006864,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.6307,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.4403763082610581,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6056,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.49814568194496855,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.734,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.41978407244414206,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6491,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.41463671705550975,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.6125,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4117919435091052,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6596,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.4339789880833592,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.629,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.5259979050287945,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6626,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.37344303123801154,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.629,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.3790064804687914,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.5889,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.45761397831864875,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.6793,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4289257270302121,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6444,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.38046977543975785,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.6396,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.3951377232346646,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.68,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.4212853470737934,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.6226,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.48858763273052036,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6551,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.4135828243426833,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.6525,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3847938876096351,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.5939,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.38890861500471724,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.5691,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4136033176045734,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6644,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.6868547482667599,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.8096,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.459762749444286,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6286,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.42440389461679645,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.6463,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.39160122224974353,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6527,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.39897606690271464,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.5795,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.37557030555380766,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.5773,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.4339982295309075,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.6233,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.39570577798208106,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6276,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.3693545088566975,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.6218,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4255677202256316,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.655,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.3815337333068184,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.6685,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.40631620283837727,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6269,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.5140202578119795,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.705,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.4097602726401862,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.7011,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.356559194773096,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.5875,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.47372925471991206,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6191,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.40884250486595014,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.6747,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.35824242354732977,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.5815,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.4188952895360446,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.6485,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4248652087397519,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.685,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.4474075111691391,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.7218,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.3928266375173183,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6051,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.456892325397415,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.607,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.394065013835043,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6122,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.38099445169011364,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.6408,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.3616810806947919,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.588,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.3464251650455377,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.6282,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.4305910204395361,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6426,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.4601820483165333,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.6547,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.36916695811479766,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6605,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.38115261332943323,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.6157,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3988189730753151,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6277,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.38448671069725276,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.6408,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.44966188238655397,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6595,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.3983863858624136,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.6776,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.38956811671641306,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6073,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.3740991924737241,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.5863,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4382104532354976,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6068,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.44523308215970514,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.626,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4050514605095301,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.566,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.44418607390339454,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.7074,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4078146301957918,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6424,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.5548149284659238,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.7001,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.46736439462345836,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.7094,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.4518059608864925,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.6831,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4425256887616959,
+      "learning_rate": 0.0,
+      "loss": 0.6108,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1092507233615872.0,
+      "train_loss": 0.7165939631938935,
+      "train_runtime": 19654.796,
+      "train_samples_per_second": 1.018,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1092507233615872.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7aacd364b6a2e06745417f97963fbf5ef1bd654f
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "v_proj",
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "gate_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..621bceadcc0722ba6a274d73b215dc7505e0fb0b
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4e77648b06a232b0f637629669895ef414878adeb2c1e242ee78c4c1e8c4384
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f558ae91a282fc45f2aa3944bed7ed032a3a0530
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcd0d75dc06578ee9f1adc8d554a420f8d979fea00ed1483f42202a5a4597b6f
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e2db56473d5b1d17f4975876e3d6e95c36d3fc8
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.9040188214118176,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.4158,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.8074281761096858,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.276,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.9202532471159965,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.3454,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9621776164972817,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4377,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.8873872672858544,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.4082,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8203039349084842,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3406,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.808691922577677,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.2893,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.1192000416783763,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.2639,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.9533747850668197,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.0958,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.5229297009363443,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.1333,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.8190964064118581,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 0.9881,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6678239522256918,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.9268,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.7765669885509979,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 1.0071,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.7902060939230818,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9962,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.6222902380931187,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 0.8933,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6276138188304536,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9499,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.504029875044396,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.8542,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5882045078003736,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9945,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.6067680318757338,
+      "learning_rate": 0.0001,
+      "loss": 0.9622,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.518887948143142,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8837,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.5203264428776911,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.8789,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.47424481517796524,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8534,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.4941053704203212,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.8618,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5818901801269433,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8146,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.5911189948900281,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.9155,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.48524617173345874,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8631,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.5138017758711481,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.8625,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5764785783785729,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8924,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.5617563618701408,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.9154,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.48801519614793604,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9098,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.5886665769080431,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.8731,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5038223806917298,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8437,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.4910317511980981,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.8726,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.4970925346086658,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8247,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.4783118687921105,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.8158,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.57283183166534,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8997,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.4869656964368314,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8427,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.51038300513493,
+      "learning_rate": 0.0002,
+      "loss": 0.9406,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.4738312757391544,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.7848,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.48785244711182263,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8485,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.48335867509192165,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.8593,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.4371580297482496,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.7799,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.4848757194195308,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.8057,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5953297448372896,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.9029,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.4832215495190979,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.9057,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.472982312319888,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8817,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.5515206751669125,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.8644,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.504088694158904,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.914,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.4855141657192389,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.8794,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.45174233023081406,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8209,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.5366171085546535,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.8341,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.416867725937757,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8277,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.493119369029618,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8414,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.45936334657120104,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8536,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.4925848121650304,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.8257,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5090191041405892,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8029,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.4610077140549898,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.802,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.45304790763425,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.7792,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.4851067386832999,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.8257,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.49386766283262384,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8654,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.5778003128858153,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.827,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5982362485307835,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8657,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.4533742372264301,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.7645,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.43785735755869126,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7817,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.5031107349688599,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.8511,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4266434125759772,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8479,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.7435323722682388,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.8925,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5172299270586722,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.7584,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.4887792570599819,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.8147,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.47558116813678436,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.928,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.4755402402219778,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.8153,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.43608674343070436,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.7996,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.4560152584275628,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.7683,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.552602123722438,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7999,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5021024404811664,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.8828,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.45265727595403926,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8004,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.4703802428678396,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.8478,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4674731530607759,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7384,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.4410723711791221,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.7545,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.41824527148519003,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7648,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.45545971341724595,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.7813,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5004966730084717,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8505,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.4760663850299163,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.8244,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4882789811829889,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7859,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.45758092600536937,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.7701,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.45489206820776945,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7703,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.4953973875429509,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.7997,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4706752206269874,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.7238,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.5307930125586854,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.8249,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.49981665350066634,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8521,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.5564656093618034,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.7512,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.44834150164979303,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7718,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.5159727888050307,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.8076,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4787236389531373,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7917,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.492894818998774,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.7781,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4292074077063288,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7411,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.47839784486091336,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.8444,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.5133482397538831,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8673,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.42033885728008547,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.7766,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.46739787930042637,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8283,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.4307240442868828,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.7975,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.49899824851338265,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8347,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.44772959357290676,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.7913,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3971504789375786,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7069,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.43937587550035584,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.7717,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.522585248895453,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.788,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.4876458104618733,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.7161,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.44230394173071313,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8522,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.5118965668326318,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.8358,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4307903664861674,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7655,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.40542527217686575,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.7809,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4575152055233773,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7818,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.4492476210335051,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.7938,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.42168461140796104,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.7822,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.48686512745299765,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.7824,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.40701186895737285,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7672,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.5405954671628467,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.8782,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4648181416591156,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7651,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.47812112860109685,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.7482,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4649791025079638,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8236,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.41964331638159746,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.7619,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.47113555893383857,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8301,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.39594357398635965,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.7286,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5197786058531375,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.871,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.40914468391436765,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.7935,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.48421063496832645,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7757,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.40375094393415706,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.7551,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.40718004312825856,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8125,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.4812014892417162,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.7058,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4953728745419649,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8233,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.4700253036597113,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.7956,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5215964588468351,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.844,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.4724685471889322,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.7369,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.4834071598595188,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8177,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.507433079199277,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.8305,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4728754801754339,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7487,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.48745482688410646,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.8188,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.4642048721871913,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.7658,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.5378006281898445,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.8066,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.41028452787634606,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7391,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.5577951692374868,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.7467,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.44260415171980666,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7558,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.543371139901953,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.8466,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.458721343297423,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8494,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.5390806782860094,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.8402,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.48744932087130155,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7678,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.504789130409822,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.7891,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4465558048903934,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8409,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.3842798692320988,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.6798,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4945421272518994,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7774,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.43464997145903267,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.7287,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.48187487570821935,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.783,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.42590267799494497,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.7811,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4476569948112045,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7811,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.4746596130009304,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.809,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.439012305507357,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7954,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.4849607584466126,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.8013,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.43950558594870676,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7969,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.49193291403363903,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.8599,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.47752096536945043,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7436,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.5085683059812202,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.8096,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.48482351023606757,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7786,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.45731700507184797,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.7232,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4881198659574083,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8234,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.53339133035444,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.8353,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.5017086808903384,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7523,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.4519444173975852,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.8024,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4368475174874208,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7571,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.46015022149254187,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.7922,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4526030924546475,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7819,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.43878196577602335,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.7911,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.45829512673345807,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7991,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.4673957138089711,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.8296,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.4570484436862653,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8398,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.44614625036883643,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.7416,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4301659125357434,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7532,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.4915260373013018,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.8246,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4588067567139292,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7283,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.47831915684525106,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.7658,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.44874842746039945,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.862,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.5679768293469264,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.7803,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.5120399489784894,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8634,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.4484584521277922,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.7415,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4613681670976354,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8105,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.4423279781155262,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.7747,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.45490201656129764,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7403,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.4220401525170389,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.815,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5194451614177903,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8446,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.45210322284078563,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.7791,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.46050866955640335,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7157,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.4235505136533195,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.7635,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4703681211642257,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8228,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.44136920565020055,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.7433,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.45519261099119485,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7425,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.5383156346157177,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.8363,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4649654247908177,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7671,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.4028443817499883,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.7348,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4189554064231662,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7221,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.43893145761291824,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.7863,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4383750996085504,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8282,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.35778735409711204,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.6993,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4927470353487211,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.7192,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.4608492380594332,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.7428,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.48397589855498346,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8228,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.45123776740033,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.7836,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.48630214364118873,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7871,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.48414963185872245,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.8581,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.43828562684155925,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7769,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.47294900525907246,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.7628,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.3897177476488877,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.766,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.434877741565264,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.7797,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4168764136828289,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7935,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.5733400243587182,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.8599,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4670533609795901,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7234,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.3910677038914766,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.6943,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4696248418112437,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8311,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.4685919639322512,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.8238,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4068952644051147,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7628,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.5001897403160873,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.7819,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.49498086830282373,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7927,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.46919273381855464,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.7854,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.46719410797559685,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8124,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.41358190476326695,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.8068,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4048646256150849,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7136,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.4645755589249061,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.7403,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.45204161778965424,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7742,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.4296617628050709,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.72,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.43401567939298147,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7885,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.5041044277014657,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.8149,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5188659570705662,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.8609,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.42446039511917316,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.7654,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.389139862357896,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.6807,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.4231105336792517,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.8009,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.44988946913128747,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8206,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.43170221354481614,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.7268,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5343955357192294,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8537,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.45997831665191913,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.8326,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4130366400377277,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7269,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.4392305512227261,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.79,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4082950983629924,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.712,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.4291084127655194,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.7103,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4589004338736792,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7601,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.4591406240237622,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.7697,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4962568753940962,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.78,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.4134638367406118,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.7439,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.42922739161294365,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7037,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.4258601748130776,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.8155,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4343439953650062,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8443,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.39816417780442004,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.7369,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.48743224261890594,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.862,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.4472661555055735,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.8504,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5496124715565721,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.785,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.39415142341636117,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.7165,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.42039212706623386,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7382,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.48614478447830717,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.7797,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.46549521278374123,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7051,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.5232250422287944,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.7975,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.5523566081412623,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8275,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.4472619801194902,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.8145,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4136886310159193,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7447,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.4612846975781372,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.7825,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.4455585318867921,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.788,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.5064153760953172,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.807,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4661251981742707,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7484,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.3821580778603391,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.746,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.3831976970751616,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7057,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.4022448064622375,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.7458,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4934838492023005,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.8417,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.476188751217707,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.743,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4567472664299517,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7725,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.504907834045924,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.7827,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4097416674665615,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7228,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.4045760812288052,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.6967,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.4221629374769461,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8001,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.44254128881116356,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.7954,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.44157543567294516,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7273,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.3835702807213834,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.7686,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4289662272715087,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.6706,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.43635961138793755,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.747,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.41113903850753764,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7771,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.5076809297662583,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.7254,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.4621183740096553,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7807,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.4448200360900046,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.798,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.42688334948529383,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7191,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.4427575286048552,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.7336,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.45215607236247485,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7761,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.4216117939443323,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.7111,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.41841797640236994,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7131,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.42424125348843356,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.7085,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.43822645489543594,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7565,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.5014000995540826,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.7768,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4533902744400488,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7577,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.4760129113128871,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.7964,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.46475141271015147,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.776,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.44577724078727743,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.6926,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.46018865828232647,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7761,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.4273900429531465,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.7104,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.48720115137203984,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.6669,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.43724150243758453,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.7849,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4536456547647474,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7314,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.41949648012326524,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.7841,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.5530323965617054,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7513,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.41496103490168035,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.7043,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4342150998833833,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7713,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.3836574507427057,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.695,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.4833306604596649,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8012,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.48941071903525935,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.83,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.45546309598972384,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.677,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.4273286577487821,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.7741,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4363221942447534,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.752,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.4994010117040842,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.8144,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4341683399826866,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.6678,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.401530866349554,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.7257,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.529625527241754,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8088,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.39026455230444157,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.7194,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.42800283073832723,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7903,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.3972390258183331,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.7185,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.3834178092651497,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7082,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.4518687861232824,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.7919,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5346360451967396,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.8324,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.46979803699729494,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.8236,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.43694407342820374,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7477,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.5095323025630049,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.8413,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4968128564994145,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7885,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5078776182199096,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.7603,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4744412438088314,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7661,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.4614229691130732,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.7322,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.46843691236917434,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7172,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.4684751645469075,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.7047,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.46630543381550216,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.73,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.43745535560275245,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.7629,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4762020250493979,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7868,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.4357948760020524,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.783,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.436943677523882,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7373,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.40932891382179964,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.7202,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5883861574712437,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8278,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.40132402370081177,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.7263,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.4115217805673836,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.6755,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.4268351110332894,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.7103,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4923655416141933,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.8044,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.4997493249774998,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.8539,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.48643623176041373,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.8026,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.4773576785484544,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.7214,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.39494352193372345,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.6902,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.46984891638247384,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.7609,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4333884424966963,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.6992,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.4040061587158778,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.6719,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.44559369491003564,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.6996,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.4549801746358309,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.7783,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.44001321625949863,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7176,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.4639773269310298,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.8004,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4303955636727868,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.755,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.4141004937252427,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.7598,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.43962722102707485,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.81,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.4162715488430212,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.6627,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4529540083988486,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.789,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.42182531250811367,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.72,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.40127151670189865,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.734,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.4671780008586571,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.7381,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5032627114867887,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.807,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.44473524675660386,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.7303,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.4162124864646265,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7336,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.4449890529639507,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.7486,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5237231064323881,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7471,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.4207160499922226,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.7261,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.4704045263890853,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.8208,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.43304544107045895,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.7635,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.39131061001722545,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7196,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.37808206725966376,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.7097,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.3743360142983906,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7536,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.49000270338630736,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.7793,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.412541252941152,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7385,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.4876990343890208,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.7508,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.45483122387541086,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.6547,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.43292404787859134,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.7429,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.5240154554847967,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7897,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.39038384511217455,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.6865,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.38623020339333136,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7633,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.429645339971841,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.7662,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.43721601515086034,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8308,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.43647395057503274,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.7686,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.40950006672868094,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7398,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.4912920895700108,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7458,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.6168180200741212,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7912,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.41411027495970254,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7168,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.4497642149521469,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.6723,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.4418045102455252,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.7442,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4373246281997312,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7293,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.47325885066264217,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.7655,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.3807983166517147,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.6831,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.6715867627560135,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.7487,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.48371564351939556,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7976,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.48130480681229615,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.7378,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.4617302480330167,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7098,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.5267134983219407,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.7405,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4650556793621377,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7808,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.5265641825715518,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.8145,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.3931228325769163,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7726,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.4837066960426388,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.8093,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4184883413355477,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7748,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.39714913018174774,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.6945,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4486406663071972,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7621,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.4802101093193108,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.7852,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5632519786978436,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.8368,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.40690770552546535,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.7121,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.4167844799568287,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7582,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.5272911783056048,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.7908,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4056347008128688,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.6806,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.4653365044749056,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.7713,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5202471557781018,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7944,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.41787271318453006,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.762,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3761029147324706,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6588,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.4072236756118215,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.6799,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.43890168022863474,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7598,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.5269865532585807,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.7183,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.5142377830763563,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7427,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.39680223133582515,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.6888,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.38605801338535556,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7538,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.4312235067932873,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.7073,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.47661865931411446,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7361,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.378208127701607,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.7094,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.40054410067166246,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7333,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.4465448155069957,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.7734,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3895583553778457,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.713,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.4456224885093965,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.7199,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.3883694852647505,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7291,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.39948802584076715,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.695,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.38137650151076147,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7001,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.4668339041021015,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.7842,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4535718625573633,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.6747,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.4648477500228524,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.6815,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.47199304890528454,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7611,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.4099428591875097,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.6938,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4535611989804537,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7348,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.5035535828466178,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.7362,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.42283019750423534,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7185,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.4838752327509362,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.7388,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.42024802280700807,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7356,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.4160541380585408,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.7701,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3788347031877241,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7325,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.38735818692185703,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.6893,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.3597178476469672,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6976,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.4178694077843292,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.7257,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.41761446690467124,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7874,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.42679300828782424,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.7151,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.39175218883611723,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.686,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.4563357236112232,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.8581,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.38812383097666997,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.6779,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.38277783873384447,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.7104,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.42620266377459964,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.77,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.43257156599828467,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.7269,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4049971816820278,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.6721,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.41157857479213467,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.7696,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4704422303871149,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7603,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.48366105528864356,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.7488,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3943341335087824,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7683,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.4132128850022526,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.7668,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.541500045682358,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8034,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.40760376296989886,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.7051,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.46584302551593115,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7822,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.4537814273231918,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.7745,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3929636636729761,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7179,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.39104933951566373,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7321,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.9724579030680056,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.6767,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.3937525902011488,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.7209,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.48600652817492457,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7702,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.4732081299865097,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.7576,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.37407767109013385,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.6783,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.45168072106832896,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.7194,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4584354719992922,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7765,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.39354936124716017,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.7156,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.42255891436963455,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7169,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.37187942916048466,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.6497,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.45687994821533223,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7141,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.36445282527599093,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.7537,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4005479130992262,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7476,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.46674310582409684,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.6526,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.4260748722860748,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.69,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.46437117291237945,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.6467,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4415056988395311,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.736,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.4908767661708374,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.7134,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.47859834363322507,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7785,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.47439178111884583,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.7899,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4412734441330705,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7184,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.431771660931234,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.7624,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.43896564564751067,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7308,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.40217102245700914,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.7277,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4389039155421663,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7001,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.38581682640274634,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.6617,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4079475204598973,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.6675,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.5585268315795652,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.7166,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.5022464531472589,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.8057,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.394046028658977,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.7622,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.44257697534003737,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.6819,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.38307503044539076,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.7019,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.41807116882005585,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7152,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.45290570075320474,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.7348,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.3925236397704885,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.6861,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.45149071314815764,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.6878,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4071222732548133,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7811,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.4421342869463711,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.7167,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.5760793900115654,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7143,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.3872733363864618,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.7523,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.43872689780567015,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7063,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.38939207805743736,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.7535,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.44352116285817117,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7692,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.49678791584464294,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.7666,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4085049271929804,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.749,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.40304066590545856,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.7113,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.43042250935552345,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.756,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.4147908128865509,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.725,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.37609010502624507,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.6754,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.40850172243478794,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7368,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.44390614168079706,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7222,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.4120377036632598,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.7087,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.5573398188129507,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.758,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.4200468433710076,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.7227,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.38446339270571694,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.6964,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.4306117384300308,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.6973,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.40291091663964096,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.6936,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.4488480953601367,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.7487,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.5513524541003362,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.6943,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.40768522673540947,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.7154,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4077524614993159,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.707,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.4159916451775266,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.7144,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.3956708039094935,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.6597,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.4483404590623498,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.7989,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.39627321979143537,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.697,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.4331520131916862,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.7778,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4148175928410438,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7011,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.4003042927395414,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.7136,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3720842311567061,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.689,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.40789792375893974,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.7181,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.4844489312487811,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7413,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.3701294279514779,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.6729,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4000085713509856,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6832,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.4672720327002756,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.6796,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.44961852455501133,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7298,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.41438508218575254,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.7158,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4325399033022962,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7777,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.5053188784919949,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.7933,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.38659410950521067,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7245,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.38715476585865544,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.6754,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4747342319133257,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7865,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.35438085675639874,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.681,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.44508203027244037,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7308,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.38735676030249233,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.6808,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3872391895733674,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6845,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.3620989909229493,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.6751,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.43790438606046156,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6791,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.48366803986218093,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.7287,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.42143361862716416,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6682,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.4585085971230449,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.7668,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.3789995384101326,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6827,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.4429889133762628,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.6507,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.357270318009767,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.6593,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.3634325493084814,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.7329,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4275663523962742,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.6958,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.4778306138347968,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.7725,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.41254801675174013,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7217,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.44175419694261003,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.7179,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.4516574575775512,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.652,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.44152357206659537,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.7973,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.44010711894728904,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7573,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.4030181690076236,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.6636,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3713858272541694,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7753,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.3852877383621456,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.7468,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3971656730292044,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6555,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.412677326432461,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.7235,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.43671968280578716,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7513,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.5079628329497556,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.7377,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3922201059679018,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6935,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.40716418915542074,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.7205,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4664659743807478,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7451,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.44727220145677477,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.7352,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.40755267128384315,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6837,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.46850145166036933,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.7688,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.3981627024681664,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6919,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.45156301666489845,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.6625,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.35755911566580956,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.6978,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.3835079024265838,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.6633,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.47477190931220487,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7835,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.3891714455141602,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.672,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3781377080874938,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7253,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.4411229793123389,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.7146,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.37640745820873606,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7109,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.41310883543295873,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.6555,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3918784302374364,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6954,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.37616228208789243,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.7034,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.44071304271586675,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.6857,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.4202177258854724,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.628,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4041060891580057,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.6533,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.4756951270911055,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.7524,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.45831114124482203,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7382,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.3932523154677725,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.6842,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4358429001327487,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6159,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.3655943811613015,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.6671,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4399673893957657,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7069,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.43456410694718184,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.7314,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4633362962833651,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.693,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.46861443447088086,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.7057,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.42640314638531535,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6459,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.448308170960733,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.7176,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.39875915954725155,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7211,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.37935878850667626,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.6682,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.4263246375252377,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.6852,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.4102525106845525,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.6988,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.38935815027258963,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.678,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.4754577297673937,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.7183,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.34456311695968816,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6265,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.4376235627193579,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.7115,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.5207658620128204,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7223,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.8435111329515005,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.647,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.39473901931373223,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6482,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.41239851384583864,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.7124,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3667110865340065,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6974,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.36671074295549133,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.6495,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3682874881376714,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.6686,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.3849121179416742,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.6678,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.36527401278127675,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7052,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.4028325536586198,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.7029,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.81310474354593,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7142,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.4460387883908326,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.6854,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4225285883984137,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6885,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.43775957404339033,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.7115,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4071503468095915,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7081,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.4072514865834203,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.7329,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5130755118707295,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.711,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.3923604141588235,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.7086,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.41955181246507084,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6916,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.41686539342300905,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.6599,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3687345210472633,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.66,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.38787817928589796,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.6646,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4206728964596731,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6985,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.4492924362995365,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.7977,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4173882948390458,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.661,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.4124795178345076,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.6913,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.5040626473799648,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7675,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.40943133951872795,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.6529,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4261878138325544,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.731,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.4423565629647469,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.7667,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.481863110573246,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.739,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.39023517953601977,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.7213,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.41072257265011736,
+      "learning_rate": 0.0001,
+      "loss": 0.6929,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.5312348751411199,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.6789,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.40087709239929087,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6247,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.44410417507762034,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.6877,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4245476136267957,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7603,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.3780854932238046,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.6528,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3792354421894993,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6637,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.3936634698121772,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.723,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.39393154842887423,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6381,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.3927738883541016,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.6986,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.5034317083917805,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6412,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.43113404265467736,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.711,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4705743937849481,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6725,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.4792403381754814,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.6443,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.43819077188745703,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.6952,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.39775188295886227,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.6848,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4046047048309528,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6982,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.4375084759429687,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.7474,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.44147482075562255,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7411,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.5323479852419991,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.7737,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5508071254334945,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7913,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.3696957269342562,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.657,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.5281517742483623,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7186,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.3869413765965512,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.6461,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3934247776585974,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7359,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.37114395834723085,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.6752,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.42642359171154925,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6942,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.3921630135688927,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.7221,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3853689735301196,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.6923,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.4731525770960137,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.7174,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.38663972282007136,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6767,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.39610131190645576,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.6656,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3754135509778824,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6134,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.3801330212153967,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.6722,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.41028895427027573,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7008,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.39141695430587464,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.7151,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.44112737661889,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7079,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.3718632770115604,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.6287,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.39364432961357876,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7042,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.4589740532198,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.6797,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.3693628499573939,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6285,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.3876729785604324,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.6568,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.4985616156524874,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6603,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.37122591368859864,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.6536,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.40407452323589693,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6841,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.41469700299247075,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.667,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4450566866828126,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7197,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.4512946184916618,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.7381,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.3834099121261059,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6461,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.5142206798245783,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.6494,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.4613832855034234,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7748,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.3947092325863364,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.7202,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4049850438144679,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6778,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.41145705863803866,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.6406,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.47048080897016403,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6565,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.48566114930219795,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.7316,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5316059554529852,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7287,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.3865101011840362,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.6617,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.4096011733838115,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6473,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.5431142279086918,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.6405,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.47035277608730325,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7469,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.3946973147334313,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.6885,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.3960346996238459,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6673,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.46568227487979896,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.6794,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4229367779207583,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6814,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.44380573773074394,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.6549,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.4104428725647028,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7053,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.4608466261939257,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.6131,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3776890700015985,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6685,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.4599150879515932,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.7842,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.33902074110600416,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6193,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.4411052370972699,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.7052,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3747971405873802,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7022,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.46448375655372215,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.7483,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.3856855398064522,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6887,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.4998333150402489,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.7242,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3834524748835133,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7165,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.48115621176536166,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.6942,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.4922713984270049,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6318,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.3986544050796855,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.7141,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.40855269821962936,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6577,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.40501432698401973,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.602,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.39738148232900583,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7335,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.38766163926276814,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.6813,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.44383473258838163,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6852,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.42206468977725525,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.7053,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.4153572045211229,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.719,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 1.0923746965653356,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.6559,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.44930053107994644,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7261,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.40637606745788074,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.6804,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.3837904442426325,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6959,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.4284283375699517,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.7494,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4178362115797804,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7129,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.7761490225234231,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.6737,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.38018739961984366,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6219,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.3972394773066105,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.6465,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.45967221724548674,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6859,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.38380842446436886,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.6675,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.4046122518219008,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6781,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.45503231168566083,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.7489,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.39453696611094263,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6758,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.39531586464235496,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.653,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.37970796664435597,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6398,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.3943564238156354,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.6248,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.405054766800856,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.665,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.41680869276437926,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.6365,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.37894174988526586,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7404,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.39464461456803773,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.6704,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.34508384122431485,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6499,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.41096659975697875,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.6943,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4852612878782842,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7435,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.431449953624555,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.6313,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.34161717551810117,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.5986,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.4624244394195268,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.6592,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.41256403278587855,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6655,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.41735435891969386,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.6969,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.38794315783957806,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6903,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.5452037323592183,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.6451,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.3980413708111724,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6393,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.40636128158099566,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.6718,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4437989080933422,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7368,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.3793297983580521,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.6655,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4062062166979556,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7844,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.4262858478328053,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.6992,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3807916736194714,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6445,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.3845725135795709,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.6203,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.418809086744098,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6432,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.383022219002213,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.6512,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.47668664566503743,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7295,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.4106101853039467,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.7128,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.38449380526734445,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6213,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.3680674432824615,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.6254,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.40957792884479693,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7017,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.43382556405890244,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.6854,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.41409917037082994,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6815,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.42918642809051144,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.7014,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.44385630524030634,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7144,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.43617944813056336,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.7076,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.35464296416961627,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.639,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.48189683832526153,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.7592,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.4501052381719782,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6808,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.4159299664040812,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.705,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.445291134682033,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6126,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.39663608464677486,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.6834,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.38177297587194176,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6623,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.4124401789745345,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.7397,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.37116529704099843,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6629,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.3469758455320982,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.6509,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3767146131837542,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6508,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.4274285346901684,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.6655,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.5099042034462896,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7014,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.3889311658102342,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.6186,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.44392756556456253,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7286,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.3765472685597652,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.6504,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.39876856119905346,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7048,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.3815269716249916,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.6685,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.5600926327379202,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6752,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.3536445681248279,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.6389,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.37019990622790294,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6817,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.4188872710433534,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.681,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3777204049485853,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.613,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.3880905404693292,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.6818,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.44790623765449117,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6785,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.44201618408608107,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.6975,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4880898295884519,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6668,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.381133231330232,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.6754,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.363082248641178,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6392,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.41303316154592884,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.6765,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.41012381027069006,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7106,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.3805134062461334,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.6033,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3669597657684683,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6659,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.4329871339377622,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.6515,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.40056243679097336,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6766,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.403521276952749,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.6841,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.4160653655260832,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6757,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.38991222836224504,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.5927,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.39066570019338576,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6704,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.3828468475809412,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.6492,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.37868487565289066,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6778,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.44244296442123804,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.5999,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.38987052538579636,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6351,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.38199122254894297,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.6326,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.3639229880437657,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6247,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.5315947852451907,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.6495,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.35014658605865756,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6392,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.47874843630853453,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.7146,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4629085694326274,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7138,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.3933960103130106,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.7079,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3852771061375175,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6126,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.434071766724914,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.6609,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.41621675469846436,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6563,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.4393344799203817,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.6674,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.39587381501685853,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6809,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.4231237232657075,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.7091,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4103037220562414,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6678,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.40847021747295903,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.6726,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4048507165934699,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.5838,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.4267656205061373,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.6898,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.40412044905045197,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6411,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.39907703234527525,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.645,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4313521298063378,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6954,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.45219077808300984,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.7019,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.34522413336359226,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.615,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.42276722714015846,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.6134,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4450484853316365,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7124,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.4176859898117107,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.6494,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3918207673921806,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6935,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.4687812943620728,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.708,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.42328745629807996,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6395,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.39422881345065175,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.6222,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.42531681271013405,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7189,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.39062939224071663,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.7097,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5039033039514914,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6743,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.44359403445712453,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.6597,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.45656153532577914,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7161,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.3821800534278985,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.6403,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5137085229706376,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6678,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.4267622522554168,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.669,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.3927474593428916,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.5818,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.4214596604287487,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.5903,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4435106768069405,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.654,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.4635596392892368,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6736,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3919107712030242,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.5857,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.3903892283671009,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.6375,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5529360557826836,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7635,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.4937511215541514,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.7357,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.5695510559614142,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6876,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.42617656739881216,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.6663,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4357966592095588,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6121,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.4171737223706917,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.6765,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.42444773335585284,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.689,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.38148418948543755,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.6771,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4222079247626194,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7185,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.36136220572730077,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.6137,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.4429167676676073,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6396,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.40876086503481196,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.7096,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4343712585264002,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6385,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.41837706104168265,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.6652,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.41121496227160353,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6452,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.46487731139413196,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.6356,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.48347206478487204,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6898,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.4712607622214813,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.6859,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3746852972884601,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6656,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.3978423947002656,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.5327,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.38911813310438403,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7227,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.3797046115124046,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.6511,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4430462860485526,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6417,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.45331754482416553,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.6687,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.39912659038055504,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6594,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.5155553428149129,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.7029,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.37902154478411765,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6179,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.47051796061468243,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.6677,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4022406821297786,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6697,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.4257374265228528,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.6392,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.47383434981133715,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.5611,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.47514786323404323,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.7033,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4359384293749101,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6611,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.5329107588299355,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.6733,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3304655053635868,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6083,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.378930117484927,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.6078,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3642262972712831,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6621,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.4426647290306224,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.6038,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4059507975046885,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.5983,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.4200147388174036,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.6695,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4191820120110178,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6848,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.3727689478218868,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.5903,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.34761139526376134,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6076,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.5014658551714443,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.649,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.40160927235903393,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6503,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.4308949226680645,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.6698,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.40597972271838906,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6436,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.3912854444629693,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.6138,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.39805403278461843,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6474,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.35918475556838036,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.6705,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.3639389173156511,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6059,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.4551968784336033,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.6514,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5451495288183975,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7396,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.3608943874649447,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.6159,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.41365460998134396,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6644,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.3965052298702251,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.6753,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.43263803632988235,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6215,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.39280403682379533,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.5863,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.4007479070398563,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6805,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.3912267539224064,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.6425,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.43134889815808386,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6662,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.39112026109024267,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.6069,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.40911415988552713,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6476,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.3943492444774961,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.6208,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4628673671073314,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6966,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.4632930924390139,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.6682,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.441107627731924,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6452,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.4149189127802021,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.6694,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.43893968996144517,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.645,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.4077552936838339,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.6077,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.40902726640833204,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7078,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.40357820245744364,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.6171,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4303800840793108,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6688,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.41147230904968735,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.6778,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.4314031095444482,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7021,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.4336949835297048,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.7098,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3764657579232399,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.5987,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.38838123922216483,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.659,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.37617887630060026,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6288,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.45464241972856073,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.6545,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4588920969504832,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7424,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.3791854005804689,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.6615,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.45851458768557096,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7496,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.44052936313969776,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.6866,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.5055435530031537,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6968,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.6099090576682689,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.6626,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.510700705992291,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.644,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.3867445958735247,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.6508,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.41489503739413236,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6463,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.4590763119487639,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.7661,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.46812897926817154,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7117,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.36936210084673,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6405,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.39761159107484295,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6744,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.34329473092381946,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.6376,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.4007428706691422,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6415,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.44642802669889675,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.6554,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4575153088661458,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6602,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.4069088341499625,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.6455,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.4456047523904863,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6204,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.3676766196918783,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.6873,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.433664929507155,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.69,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.373162863636098,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.6281,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.4117490437737598,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.653,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.428626925312621,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.6534,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.7975980075263391,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6506,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.4338313154865431,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.6192,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.47683533218375246,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6969,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.3709437292440277,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.5725,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.42017519667588804,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.671,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.4221200609873446,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.6252,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.37923947147275283,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6631,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.42255167433776425,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.6233,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.44733555174892403,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6334,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.3676281287462771,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.6719,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.46024687896834504,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6495,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.4360284336010514,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.6484,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.35607065705282054,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.585,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.4318624799063715,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.6451,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.42463272975768457,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6471,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.42297250293547317,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.6703,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3743925777371385,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6716,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.4067119571591152,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.7036,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.41406160097681877,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6514,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.4183605374845178,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.6575,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.6337103314644563,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6252,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.4906999686648005,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.695,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.5225891237739163,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6964,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.4930718308475257,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.7037,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5276861081614141,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.622,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.42832404920430694,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.6069,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.466458746148192,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6756,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.4544678392027321,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.6864,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3826745785572623,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6347,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.43155318180539814,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.651,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4994300693272774,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.7049,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.5943748518338728,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.64,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.47217079552895913,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6817,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.49365401456324437,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.6657,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.43436189418749294,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6456,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.6684493187722501,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.636,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4405913490891985,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6225,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.410785975404595,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.6639,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.36547794828352415,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.5788,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.4372170956601188,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.6536,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.5009233718100731,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.757,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.4056217110684542,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.6467,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.46129553983661364,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6756,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.46185062607524163,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.7277,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.41935766814476017,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.5969,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.4443093688967267,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.6993,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3942385246217329,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6688,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.42821894152387097,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.7111,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3662724786286339,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6564,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.36878600508252474,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.6006,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.4065974482647705,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6344,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.39792442091365,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.6029,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.37137896118893476,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6465,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.42008803683308704,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.6509,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4386831052560911,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6753,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.4633449001805289,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.6253,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.44299630226182,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6711,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.41085416199217467,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.6538,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.4484359919195243,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6143,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.4282550300484898,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.6673,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.4191224307491595,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6968,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.4396271764513739,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.6818,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.37387472496239993,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6773,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.48374131896293865,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.6646,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.43633236192352065,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7082,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.4239669218101152,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.6653,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3864223719763971,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6042,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.42555231885020994,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.6615,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3722682023776691,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.5767,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.4337623849323105,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.7483,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.39865898952737927,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7382,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.4437071931347057,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.6463,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4008299282027897,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6396,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.4009493165215926,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.6523,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.5062076346003773,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.5861,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.4516942937441003,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.691,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.4853529279383495,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6001,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.37905471747272673,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.6328,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5284608677347281,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6908,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.38608717956848915,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.6388,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.44267069516551083,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6231,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.3786824586900781,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.6136,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.4270504804937909,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6791,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.5038946234523334,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.7804,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4623293214277537,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6766,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.44301346960741594,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.6477,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.6954121952109877,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6503,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.4044191973360963,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.6569,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4660644586496118,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.686,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.5634031960415944,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.6161,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.40473907603181797,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6308,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.440199941429629,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.6437,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.42154096060283036,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6591,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.3875040322959066,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.6605,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.4303409905159609,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.659,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.3674634232741053,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.6518,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.7316007602226419,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7053,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.49302790603798097,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.6461,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3723886365425195,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6411,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.37854206745230956,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.644,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.4193425252301274,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6764,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.35650860727341804,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.5994,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.44021936147114227,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6186,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.4162720336568276,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.6584,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.44300557421485726,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6626,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.47213299117838864,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.6772,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.38534027867094606,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6308,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.4952808746046796,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.6861,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4208322108941599,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6718,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.4704840462474498,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.7059,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.40080120473996755,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6246,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.44759857005995657,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.6728,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3960185199853362,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.5488,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.36610015351743425,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.5978,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.4472618735917445,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6497,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.464807452611619,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.7345,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4709506037877623,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7311,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.3358466208808786,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.564,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.42600342697067267,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6216,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.45028467405895545,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.6409,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.557329627370974,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.7207,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.4136112769160197,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.5935,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.37329400859832346,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.622,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.43682334669658196,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.6404,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3748636097341092,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6223,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.42722010259960563,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.6276,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.4518428694340991,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6902,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.3623217419092838,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.6293,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5573806212574868,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6891,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.3991199845067119,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.7093,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.4170610132690793,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6367,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.3596897692127439,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.612,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.41697308922376014,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.688,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.4112266335214538,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.596,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.484535098122851,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6357,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.4145119576764233,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.6603,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3845276967918898,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.629,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.478000040837266,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.6603,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4637204727474049,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.642,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.3928239925222747,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.6576,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4231514273592192,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6574,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.48355232084777794,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.6217,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.5947856046562446,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.7306,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.36458883310346435,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.6211,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3960739949887535,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6589,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.36023352854279544,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.6245,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.40780844491338397,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6174,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.582970005377322,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.7372,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.38500948239471194,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.5915,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.4287912176074291,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.6193,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4198902015159911,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6236,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.38233551950051503,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.6016,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.5395772738169392,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6322,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.44702077463576917,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.6535,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.41079916482077417,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6167,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.37665556305187514,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.6007,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.602870889721456,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.5824,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.4234135876004459,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.6431,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.36822255437976564,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.652,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.36510134079543993,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.5875,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3729073618035483,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6507,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.3523972331305135,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.6131,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.4072440360190982,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6777,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.34919183194492326,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.6133,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.42513790124721773,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6218,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.46900697721960166,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.6291,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.49638656881266513,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.654,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.4080519919725839,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.7047,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4208253803362493,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.5866,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.4925975823169105,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.7029,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.47362666344716114,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7168,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.4397762197275533,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.5982,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4773330206512832,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6696,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.41715927620580495,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.6236,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4872669921994879,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6887,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.3739624404078296,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.5864,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4644012923456862,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.5956,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.5171644364848799,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.6794,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.5036272549775639,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6631,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.4084323550438053,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.6714,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4582317574681205,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6538,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.39260460346022474,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.6378,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.40838108087739916,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6872,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.38946336528002273,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.6622,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.37207487789605137,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.6444,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.49153673402231457,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.7307,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.40195076339811375,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.5939,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.41042476999125416,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.5692,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4685435198499234,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.5941,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.3563779620133779,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.6011,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.3983783041419285,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6511,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.3517579195645366,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.6642,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.370928425327566,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6292,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.4060622147021784,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.6731,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.5931334860219275,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.5995,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.5331085365721472,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.6593,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3598383122326138,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6384,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.3915947721446365,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.5701,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.3821055121736776,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6553,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.3602041145233819,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.7049,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.44548436852189566,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.668,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.4373904941373431,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.6402,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.438827373751763,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6561,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.4362264956140032,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.643,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4044806923870212,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6624,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.4000772256196655,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.5853,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.4083796539848895,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.5917,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.3894230156325759,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6288,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.36107007695240323,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.5881,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.45402894888204026,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.717,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.34806835664596164,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6233,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.39793870698491896,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.6874,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.39401129084286485,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6662,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.37772600752742874,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.6862,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.37450273581911875,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6451,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.38176089301787547,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.6373,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.35358647926829284,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.663,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.4602139471488554,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.6226,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.5416889606730806,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6362,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.4036750223093132,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.6532,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.38404678255442,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6097,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.4267377090747644,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.6923,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.41348061051676394,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.688,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.4185678843563295,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.6636,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3344176284141208,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.5905,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.43078748964380953,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.6599,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.3935543093104591,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6527,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.3332478313177662,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.5986,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.43121211559573536,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6474,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.38363861306199587,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.6647,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.36619989098104866,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6225,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.4718860301100266,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.7245,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.40222763581499665,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.642,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.4880004363496894,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.7039,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.3837309331013992,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6673,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.4009821811006916,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.6035,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.404170233361831,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6529,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.44007382139111645,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.6554,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.40124653569122576,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6451,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.4067421755106845,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.6618,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.42871209202919514,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6348,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.40897502568707533,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.6503,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4115778403597554,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6875,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.3888626628329367,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.6878,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3898464069550168,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6892,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.4240977211745303,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.6978,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.4083436974329831,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.648,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.43683232074338657,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.6371,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.44405962563885487,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6408,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.40189618267215416,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.6206,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.38143520659589464,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.65,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.553594963156896,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.6835,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4606935828464384,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.7302,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.5377134495942296,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.7018,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4447776733368485,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6602,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.34772105247308416,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.6197,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4392583568148265,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.7154,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.4253963088334093,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.6482,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.39185547598418924,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6828,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.4071068410359769,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.6466,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.579665557785551,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6259,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.4045183788046934,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.6416,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.40827472515450947,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6263,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.4048930082559139,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.643,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3981307050442245,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.5864,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.4202381520920859,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.6426,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.4132844897416911,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6767,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.42446177971468574,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.6668,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4365754108889452,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.7009,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.4208860054391876,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.6772,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.37814548759802874,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6669,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.42343131577798393,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.6502,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4327753082312952,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6694,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.3943042623046033,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.6288,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.41688071618548456,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6374,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.5003834831982291,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.6499,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.45866807006109467,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6706,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.3916081291545642,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.6381,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4770064931608081,
+      "learning_rate": 0.0,
+      "loss": 0.7777,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1096393255911424.0,
+      "train_loss": 0.7163409863471984,
+      "train_runtime": 19465.3314,
+      "train_samples_per_second": 1.027,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1096393255911424.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c30a80f55782215f57694327607360c6ad690188
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "gate_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5e0187cc3747e5cfed2719b67b0e3550e2bba270
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a78cb0ee8becea056080a5e85bebb05c98c6596c42b552ea117abb9b85680339
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c30922df6c124f2d37be9677cc351d74fd23c500
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a43ff62a24b2b8a809607b50c5de7f0ee2bdb37db617f9fd96e81da20c929c5
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6483ef95e0c7a1741c80563288ee6861c26cad53
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.9564362205502492,
+      "learning_rate": 5e-05,
+      "loss": 1.3891,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.0670570078098212,
+      "learning_rate": 0.0001,
+      "loss": 1.5647,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6832004913511157,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1882,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.4040934204807518,
+      "learning_rate": 0.0002,
+      "loss": 1.2526,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.0594878077292809,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.0504,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7880067192842751,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.0003,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.6095190630196066,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 1.0152,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5250870981951875,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.961,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5100945930231707,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.9172,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6205753925466002,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.9564,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5508857901203891,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.8852,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.6483010342507907,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 1.0694,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5668252221073122,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.9364,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5813967382806945,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.8932,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.49591366297717443,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.831,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4672597855174128,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.9122,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4744190657329229,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.889,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4733278011435993,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.78,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5074702125395911,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.929,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.46463115723052933,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.8786,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 1.1053535634091058,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.9157,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5765958674318559,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.8675,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5301032501211107,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8631,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.48065832362239796,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.8291,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.40329194748717834,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.8386,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4518990740306019,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.8682,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4849664224399919,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.9325,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4892145977372225,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.8931,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4700856635995607,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.8751,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5337272569459788,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.8884,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.43616755029040166,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.8273,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4802536284656567,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.8929,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.48669517596505296,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.9583,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4006984433764588,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.8102,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.3804581210152902,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8244,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4018169810718193,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.8337,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.38601705911275547,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.8522,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4770897944661433,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.8619,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.4436139687419418,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.7643,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4510651437608395,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.8198,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5204234563209176,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.8797,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4553293071795067,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.7873,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.451874650212316,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.8664,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.45660075722936766,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.8353,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4578324100950842,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.8545,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3901315778595456,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.7646,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4709543810589498,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.8438,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.45352980772919727,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.7773,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.5315738799272542,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.8384,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.44764161698075966,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.8097,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.451036919752691,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.805,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3772027967638337,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.7472,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.40519776999433677,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.7577,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.40847094673729234,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.7304,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4061703470017864,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.8478,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5284646726291284,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.87,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4072381711235819,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.7865,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4638388427170942,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.8448,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4255657720832117,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.8351,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.41307171477445137,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.7264,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.39579024422674575,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.7356,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4694881217262419,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.8502,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.4001519238520662,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.7873,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.42210737351272143,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.8427,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.40525818022776766,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.8052,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.6122927350978119,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.8228,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.42711811000314354,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.7886,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4495790965625436,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.7685,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4413267802962418,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.7577,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.497785940917474,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.7098,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.39572697753004543,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.7946,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4324613617825247,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.8368,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.5311744381229567,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.8749,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.42782020928719666,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.8097,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4268388603862346,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.7788,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4418608750620497,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.8151,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4513775352416818,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.8862,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4288780904259751,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.8145,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4090576841293381,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.8277,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.45189215953496836,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.7855,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.45422260123346536,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7714,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.91202440455519,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.8794,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.9149529602442982,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.7361,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4546391256597834,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.7817,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.42238075335008324,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.7695,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4424948789637984,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.7948,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.3982729241150964,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.7599,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.44348156245108955,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.7968,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.40263627071712976,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.7015,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.36656617219628396,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.7617,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.434413757991748,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.7809,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5181032894210132,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.7603,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4095041782820746,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.7155,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.42694499417908965,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.734,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.33798217911043893,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.7886,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.41589903056727734,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.7421,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.445056779586077,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.7443,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.40780163543891135,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.789,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4936441498703536,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.843,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.40784407447552307,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.7378,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.49759805586308553,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.7839,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.40226259358803806,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.7403,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.4404451039655544,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.7991,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5215311872478593,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.767,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.8015353117608185,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.8428,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4033768748067044,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.7629,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.43468570005173046,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.8203,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4314271380610339,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.7522,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4099555584962568,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.774,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.39192345338489143,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.7854,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.5606142634633692,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.8025,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4125228048773097,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.7515,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4551086684708683,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.8832,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4700100607241755,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.7774,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4022309925214962,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.7639,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.35237768138172054,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.7439,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.5271039606272312,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.7181,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4071465539898394,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.7526,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.3971023247167748,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.6728,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.40853435166821706,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.7731,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4279732050533312,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.7563,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5012557378562927,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7848,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4519002693851681,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.8422,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4089364350515701,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7779,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.40046147854066166,
+      "learning_rate": 0.0,
+      "loss": 0.7812,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 108391249608704.0,
+      "train_loss": 0.8369061512947082,
+      "train_runtime": 1950.8409,
+      "train_samples_per_second": 1.025,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 108391249608704.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..11cf9228d509db53df520199293cf90dab9e7f42
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "gate_proj",
+    "down_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5121f81caa5aeeb781cc49c4d5d4e9681a90a495
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66d8cbbfc9b287ff28dba1e24911d0eb16199d893f93599beda9d29e924e1a15
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..06bbc4c079ad9a668618f5aa287ecb8da126ce37
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:442b2c77a52040812daa27c790b34f713f5ac98306fae36fb1e83f0dccda2c9d
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..37addc6520be74117afa60ba40d4637e6fb8fdf0
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,476 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9378038948956198,
+      "learning_rate": 0.0001,
+      "loss": 1.4769,
+      "step": 1
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.8982165698182389,
+      "learning_rate": 0.0002,
+      "loss": 1.4233,
+      "step": 2
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.287847751518021,
+      "learning_rate": 0.0001998629534754574,
+      "loss": 1.26,
+      "step": 3
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.0001952815738493,
+      "learning_rate": 0.00019945218953682734,
+      "loss": 1.1422,
+      "step": 4
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6807775171811575,
+      "learning_rate": 0.00019876883405951377,
+      "loss": 0.985,
+      "step": 5
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5355248224421841,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 1.0363,
+      "step": 6
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.7570224306176186,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.9551,
+      "step": 7
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4201298524092389,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.9048,
+      "step": 8
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4980661254746478,
+      "learning_rate": 0.00019335804264972018,
+      "loss": 0.8679,
+      "step": 9
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.47119688466611204,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.9412,
+      "step": 10
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4957137205221804,
+      "learning_rate": 0.0001891006524188368,
+      "loss": 0.9211,
+      "step": 11
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4336212420660128,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8708,
+      "step": 12
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.39597291719308214,
+      "learning_rate": 0.00018386705679454242,
+      "loss": 0.8782,
+      "step": 13
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.38390249363179807,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.9323,
+      "step": 14
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3990386813171272,
+      "learning_rate": 0.0001777145961456971,
+      "loss": 0.9005,
+      "step": 15
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.36808856210178786,
+      "learning_rate": 0.00017431448254773944,
+      "loss": 0.8754,
+      "step": 16
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.37805906793374583,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.8996,
+      "step": 17
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.325235795491243,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 0.8416,
+      "step": 18
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3625820412095005,
+      "learning_rate": 0.00016293203910498376,
+      "loss": 0.8683,
+      "step": 19
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.34192579542551826,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.8072,
+      "step": 20
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.37270480043786225,
+      "learning_rate": 0.00015446390350150273,
+      "loss": 0.8442,
+      "step": 21
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.35160707209512304,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8611,
+      "step": 22
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3334062896076315,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.8201,
+      "step": 23
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3906535203730245,
+      "learning_rate": 0.00014067366430758004,
+      "loss": 0.8188,
+      "step": 24
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3577973945208406,
+      "learning_rate": 0.00013583679495453,
+      "loss": 0.8321,
+      "step": 25
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.30630218053753877,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.7872,
+      "step": 26
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3148202096857398,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.7564,
+      "step": 27
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3563247781912407,
+      "learning_rate": 0.00012079116908177593,
+      "loss": 0.8753,
+      "step": 28
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3187343500667551,
+      "learning_rate": 0.0001156434465040231,
+      "loss": 0.8261,
+      "step": 29
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.32028276858301985,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 0.7956,
+      "step": 30
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3283584125341734,
+      "learning_rate": 0.0001052335956242944,
+      "loss": 0.8078,
+      "step": 31
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3116804238757513,
+      "learning_rate": 0.0001,
+      "loss": 0.8297,
+      "step": 32
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.34278266317249856,
+      "learning_rate": 9.476640437570562e-05,
+      "loss": 0.8239,
+      "step": 33
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.30595761713026426,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.7906,
+      "step": 34
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3265249485330855,
+      "learning_rate": 8.435655349597689e-05,
+      "loss": 0.747,
+      "step": 35
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.31382488863003205,
+      "learning_rate": 7.920883091822408e-05,
+      "loss": 0.8304,
+      "step": 36
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3619161183936626,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.8553,
+      "step": 37
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.32665860366084254,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.8052,
+      "step": 38
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3253192271402816,
+      "learning_rate": 6.416320504546997e-05,
+      "loss": 0.864,
+      "step": 39
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.32960969117819583,
+      "learning_rate": 5.9326335692419995e-05,
+      "loss": 0.8163,
+      "step": 40
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3232580501894908,
+      "learning_rate": 5.4600950026045326e-05,
+      "loss": 0.8427,
+      "step": 41
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3129500054631124,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7694,
+      "step": 42
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3126921363286475,
+      "learning_rate": 4.5536096498497295e-05,
+      "loss": 0.8014,
+      "step": 43
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.2869463355408737,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.7895,
+      "step": 44
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.29201630872249207,
+      "learning_rate": 3.7067960895016275e-05,
+      "loss": 0.7443,
+      "step": 45
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.33987577027712634,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 0.7886,
+      "step": 46
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3066583607284319,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.7406,
+      "step": 47
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.28152834918909564,
+      "learning_rate": 2.5685517452260567e-05,
+      "loss": 0.7806,
+      "step": 48
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.314371197533863,
+      "learning_rate": 2.2285403854302912e-05,
+      "loss": 0.779,
+      "step": 49
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3236829247628598,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.8083,
+      "step": 50
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3402350100519127,
+      "learning_rate": 1.6132943205457606e-05,
+      "loss": 0.7785,
+      "step": 51
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.34165645874652545,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.8051,
+      "step": 52
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.36808713946400523,
+      "learning_rate": 1.0899347581163221e-05,
+      "loss": 0.8232,
+      "step": 53
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3154155924180855,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 0.8058,
+      "step": 54
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3014060919782915,
+      "learning_rate": 6.6419573502798374e-06,
+      "loss": 0.7967,
+      "step": 55
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3309114005539564,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.7909,
+      "step": 56
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3849480424914164,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.8441,
+      "step": 57
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.2793016248216318,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 0.7713,
+      "step": 58
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.31012031023291514,
+      "learning_rate": 1.231165940486234e-06,
+      "loss": 0.7533,
+      "step": 59
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2877388900526754,
+      "learning_rate": 5.478104631726711e-07,
+      "loss": 0.7386,
+      "step": 60
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3159739171413289,
+      "learning_rate": 1.3704652454261668e-07,
+      "loss": 0.7869,
+      "step": 61
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.30635308472402023,
+      "learning_rate": 0.0,
+      "loss": 0.8289,
+      "step": 62
+    },
+    {
+      "epoch": 0.992,
+      "step": 62,
+      "total_flos": 156003815260160.0,
+      "train_loss": 0.8635893848634535,
+      "train_runtime": 1927.9959,
+      "train_samples_per_second": 1.037,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 62,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 156003815260160.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..08f415cfff38d20d76e34e89b5083acd15830dd4
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj",
+    "down_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a40787995328bf0352247bf7aae228eb6b2d5e92
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08db6ba76879f06b7866cf329aab4a2cf1bd74608b416e2835593b1261051b2b
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ff6bffffb7792d517cd7ee9a65c7000e64a2e964
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f56b47db80cd8a3edb0691506eaf739f85f61245f28df904d99f6fdcec3138de
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f270cc7ca048104e2ee2f028352535399104b428
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.9300447735857797,
+      "learning_rate": 5e-05,
+      "loss": 1.4212,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8941623954623172,
+      "learning_rate": 0.0001,
+      "loss": 1.4027,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.7866260657348403,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.3411,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.2147238549075172,
+      "learning_rate": 0.0002,
+      "loss": 1.1727,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.1094493212868144,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 0.9972,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.8828202767363533,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.04,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.6522330755796834,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.9753,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4661735661736063,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.8634,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5330532854787262,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.9423,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5732343687041058,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.9613,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.531922029530724,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.9133,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5695373512799271,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.9241,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.48913628922300956,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.8069,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.600731959286891,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.913,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4888121753484574,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.8766,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.45838473336493163,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.8702,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.5523016279235923,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.9263,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 1.1236493815342523,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.9017,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.9322800469491774,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.896,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.47070914424960786,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.8231,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4502420347383483,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.7832,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.42688173145056074,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.8332,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.4416543143186891,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8217,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.42523897687726514,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.8877,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.44128672624582566,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.8929,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4761215857258359,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.7895,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.49618405427508133,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.9193,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5019616118160227,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.8678,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.45325569465538845,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.881,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4328543000849896,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.7984,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.5226044226129308,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.8251,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.495514036412055,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.879,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.42627767818455664,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.8623,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.447506945125031,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.8433,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4649905236967444,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8377,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.47770566393392916,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.8305,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.3799474520513085,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.7657,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4616004668862163,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.8183,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.46510246561157265,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.9086,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.45510655180869813,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.7559,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.4686610313953666,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.8561,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4985603334063983,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.775,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4763246817199902,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.8489,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.47932086524083634,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.8266,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4140084452320215,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.7732,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.5037644545683102,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.8021,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.511721323509284,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.7965,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4453968832294159,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.8072,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.45760166610633574,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.7846,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5687538636897856,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.8548,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.41556308383635204,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.7858,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4438322151444504,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.8102,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.44920875101367225,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.8663,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4562243620080555,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.7225,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4670306606254095,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.8674,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.38224700176824034,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.7718,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3958254451198586,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.7967,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.43967961784262105,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.8683,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.3960108199866462,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.764,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4290206261752591,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.8361,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.37343933619751885,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.6922,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.39076622437562314,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.8454,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.45557849407523243,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.8379,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.40510790499453986,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.8088,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.42655439355809865,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7868,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4274086029648102,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.7844,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.46709466970320407,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.8366,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4251199525895881,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.8851,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.39032317585957393,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.7611,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4032588998536183,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.8281,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.37713458352496687,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.7498,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4672308735351815,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.8496,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.45563903888910406,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.8047,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.37397857860843065,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.7846,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.3962217079098133,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.8319,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4017100232137172,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.7513,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4268989423918388,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.7839,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4079357701727051,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.7715,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4107276956696718,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.776,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.46619861200542645,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.8987,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.44429947024548255,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.8965,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4434324812385255,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.8088,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.3898226333525628,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.7228,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.40085891019440134,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.7796,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3681891073144277,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.7464,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3807853713661697,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.7585,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.4287329211744062,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.8006,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4879386796896542,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.7965,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.5071758983580438,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.8691,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3766842574391901,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.7865,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.377539197544411,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.7921,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.44402436974537496,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.745,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.44449641877820356,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.7177,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4382952546761995,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.7585,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3999725274248031,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.8022,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.5635995549114391,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.869,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.45441045951837805,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.7141,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.45283175276984133,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.8029,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.41260626672347417,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.8141,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.46538759918111977,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.8154,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3789292294833141,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.7809,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.41506839343478824,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.7874,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3648788353955197,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.7499,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4587909838246228,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.775,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.41463387337526747,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.82,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.394693008883327,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.7566,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.43539993591753107,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.8281,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.40760104117287915,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.7678,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.44161157683534574,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.8346,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.39973155716896946,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.8143,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4248556182588792,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.7733,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4040332779709177,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.7275,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4130206933527327,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.7827,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4331445858777044,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.769,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3985474826850022,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.7983,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4201917197177302,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.7817,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.3816079162312025,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.697,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4266755473386918,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.8057,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.3622774541651628,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.744,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.41675537181777855,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.7613,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.6487814429829765,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.7765,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4230934030543345,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7546,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4167473753566679,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7033,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.37194711508688,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7665,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4061978675006465,
+      "learning_rate": 0.0,
+      "loss": 0.7446,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 112068802379776.0,
+      "train_loss": 0.8331891593933105,
+      "train_runtime": 1965.4906,
+      "train_samples_per_second": 1.018,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 112068802379776.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ca5bf4b3175f756c017116c7f640cbe129e656d
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "gate_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..792402cae76d3c1972b4f468e13ce7fb0237a240
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acedc725eb44a6d7255443bc027118a61e62213a28a736a085681fef8bc721bd
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..430e31bcd64c5ca3b20a5814af380429af9e51a3
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb14c93195a8ed1845b7ebb0917ba61adb75db6e4388d846a3863c7fdabe57c6
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3f69163d4d2b8e2f75597575e73f23ac0d525acc
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,476 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.843145838243195,
+      "learning_rate": 0.0001,
+      "loss": 1.4119,
+      "step": 1
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.9380096803116589,
+      "learning_rate": 0.0002,
+      "loss": 1.4634,
+      "step": 2
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.9463053624227347,
+      "learning_rate": 0.0001998629534754574,
+      "loss": 1.2292,
+      "step": 3
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.8489288880806201,
+      "learning_rate": 0.00019945218953682734,
+      "loss": 1.0505,
+      "step": 4
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5599243053149323,
+      "learning_rate": 0.00019876883405951377,
+      "loss": 1.0059,
+      "step": 5
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5443751929373011,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 0.9792,
+      "step": 6
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.42752971478789437,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.8965,
+      "step": 7
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4032131719894276,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.9045,
+      "step": 8
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6875225939389418,
+      "learning_rate": 0.00019335804264972018,
+      "loss": 0.9446,
+      "step": 9
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.45195238831600093,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.8833,
+      "step": 10
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.36373689043217144,
+      "learning_rate": 0.0001891006524188368,
+      "loss": 0.8311,
+      "step": 11
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3723458886036847,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8731,
+      "step": 12
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.3404123418563292,
+      "learning_rate": 0.00018386705679454242,
+      "loss": 0.8563,
+      "step": 13
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.39141927900118156,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.9065,
+      "step": 14
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3377913900856818,
+      "learning_rate": 0.0001777145961456971,
+      "loss": 0.8531,
+      "step": 15
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3516799806634102,
+      "learning_rate": 0.00017431448254773944,
+      "loss": 0.8616,
+      "step": 16
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.34391919997614717,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.8651,
+      "step": 17
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.333954567934155,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 0.8458,
+      "step": 18
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.32630409233069796,
+      "learning_rate": 0.00016293203910498376,
+      "loss": 0.7994,
+      "step": 19
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3627179301532884,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.842,
+      "step": 20
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3664591708206515,
+      "learning_rate": 0.00015446390350150273,
+      "loss": 0.8224,
+      "step": 21
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.40521270822120725,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8444,
+      "step": 22
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.37903876457944136,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.8006,
+      "step": 23
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.34049120165740693,
+      "learning_rate": 0.00014067366430758004,
+      "loss": 0.8079,
+      "step": 24
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3877286031540173,
+      "learning_rate": 0.00013583679495453,
+      "loss": 0.8292,
+      "step": 25
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.31924225776411685,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.8034,
+      "step": 26
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.32758604465985947,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.7981,
+      "step": 27
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3347640325790976,
+      "learning_rate": 0.00012079116908177593,
+      "loss": 0.8259,
+      "step": 28
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3296926636369933,
+      "learning_rate": 0.0001156434465040231,
+      "loss": 0.8455,
+      "step": 29
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.367558043246359,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 0.8063,
+      "step": 30
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.32660814025542856,
+      "learning_rate": 0.0001052335956242944,
+      "loss": 0.7778,
+      "step": 31
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.33057989221956413,
+      "learning_rate": 0.0001,
+      "loss": 0.8322,
+      "step": 32
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.35495975739470687,
+      "learning_rate": 9.476640437570562e-05,
+      "loss": 0.7939,
+      "step": 33
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.333883052997245,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.8724,
+      "step": 34
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5342512191259564,
+      "learning_rate": 8.435655349597689e-05,
+      "loss": 0.8032,
+      "step": 35
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.30404137682103216,
+      "learning_rate": 7.920883091822408e-05,
+      "loss": 0.805,
+      "step": 36
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.30505193619981874,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.8082,
+      "step": 37
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.2979225234091672,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.7991,
+      "step": 38
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3370269390268292,
+      "learning_rate": 6.416320504546997e-05,
+      "loss": 0.7908,
+      "step": 39
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.33162910921880767,
+      "learning_rate": 5.9326335692419995e-05,
+      "loss": 0.8453,
+      "step": 40
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.34380203708081264,
+      "learning_rate": 5.4600950026045326e-05,
+      "loss": 0.868,
+      "step": 41
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.2915663459031845,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7608,
+      "step": 42
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.28523801049541647,
+      "learning_rate": 4.5536096498497295e-05,
+      "loss": 0.7654,
+      "step": 43
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3427716619184912,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.8163,
+      "step": 44
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3389826002732601,
+      "learning_rate": 3.7067960895016275e-05,
+      "loss": 0.8405,
+      "step": 45
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.283733099028701,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 0.7817,
+      "step": 46
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3172649852684197,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.7519,
+      "step": 47
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.360330762850194,
+      "learning_rate": 2.5685517452260567e-05,
+      "loss": 0.851,
+      "step": 48
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3084947842667855,
+      "learning_rate": 2.2285403854302912e-05,
+      "loss": 0.7728,
+      "step": 49
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3205822742446387,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.8271,
+      "step": 50
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.29060856532813023,
+      "learning_rate": 1.6132943205457606e-05,
+      "loss": 0.7972,
+      "step": 51
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.30648749167538564,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7784,
+      "step": 52
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.28288086095491943,
+      "learning_rate": 1.0899347581163221e-05,
+      "loss": 0.8033,
+      "step": 53
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.2989179635932655,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 0.8131,
+      "step": 54
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3315017220426919,
+      "learning_rate": 6.6419573502798374e-06,
+      "loss": 0.84,
+      "step": 55
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.32214020332980725,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.7667,
+      "step": 56
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3259571865204803,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.7922,
+      "step": 57
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3067289342641416,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 0.8022,
+      "step": 58
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3059666813346514,
+      "learning_rate": 1.231165940486234e-06,
+      "loss": 0.7679,
+      "step": 59
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3321644769509772,
+      "learning_rate": 5.478104631726711e-07,
+      "loss": 0.7719,
+      "step": 60
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3182538954435247,
+      "learning_rate": 1.3704652454261668e-07,
+      "loss": 0.7828,
+      "step": 61
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.29629475427484525,
+      "learning_rate": 0.0,
+      "loss": 0.7501,
+      "step": 62
+    },
+    {
+      "epoch": 0.992,
+      "step": 62,
+      "total_flos": 162629892046848.0,
+      "train_loss": 0.8567078267374346,
+      "train_runtime": 1951.7305,
+      "train_samples_per_second": 1.025,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 62,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 162629892046848.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..14dc4c5478c9fd3f4f0916a025c8fb02eba5d078
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d0024979e8f6c705d6696a7e2a7ecd43c7556bbc
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7238743f849ee06b63a05a6609fc45e28096664df38b5c595e6c064ed952f54
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2ba09ffb6d0c6495764b3a54d99ef74a347d8603
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:482270ab678dd84cbecf4cb5dacecbb4bbe2ca6f150cb50aefd6bdfedd2ec3c9
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..66cf301ba336f5e6a03d1954b8fbe660ba20af65
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.2054836032909777,
+      "learning_rate": 5e-05,
+      "loss": 1.5017,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.954238989110196,
+      "learning_rate": 0.0001,
+      "loss": 1.4967,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.7678466151736065,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.283,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.6082417908446505,
+      "learning_rate": 0.0002,
+      "loss": 1.1555,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.0562286263106848,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.0669,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.8265656805740538,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.0068,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.7655842282381125,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.9599,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5777687757328043,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 1.0017,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.721581419395825,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.9135,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.542168928482532,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.9624,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.6108376268031162,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.9251,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5244504646150466,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.8855,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5837393841633354,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.9486,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.6146534952260426,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.8341,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5077809580036337,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.8972,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5351319800406077,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.898,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4815175525442992,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.8511,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.45271754447994694,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.8504,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4781494508790699,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.8543,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.48122762032392724,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.823,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4914747468410434,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.8671,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4733738820147795,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.8808,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.47069335051320876,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8112,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4738223832737001,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.835,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5064877458997864,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.8642,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.44858291604670125,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.854,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5177837399916985,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.8771,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5501566931582197,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.8715,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4316046486398667,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.8968,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.42763874529306595,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.8705,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4450128699392376,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.8039,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.46132404658319076,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.8485,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4784554434315401,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.8429,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.42929968246494943,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.8193,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.39407240093282986,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.7664,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5640849704230063,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.86,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5892482999779673,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.9928,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4078559973229823,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.711,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.43250872905374294,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.7459,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.44988474520884375,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.7682,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.4666212178261856,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.7974,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4474201623249179,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.8009,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.6773915402946965,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.9602,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5060905150524829,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.8342,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4772552860058863,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.8891,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.46843358870389834,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.8455,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5439841894696938,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.7722,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.43909813467352327,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.7935,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4994615028953572,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.8643,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.42557838435232637,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.8597,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4464800320333645,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.8157,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4154749453225568,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.7622,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4423818247673819,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.8785,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.48505813932469505,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.8218,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5297224593394644,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.9302,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4172622442182296,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.8525,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.6155680038093249,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.7678,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.46909148631310976,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.8107,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4879035231063422,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.8199,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.43485709638018905,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.7881,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.4448453191087327,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.7868,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.5306331930728724,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.8523,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.4992968847325637,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.7766,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.38789752705861585,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.741,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.41232002134377704,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7716,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4104927091301637,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.791,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4536062964116601,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.8224,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4209000797113228,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.826,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.43435410119915097,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.8052,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3718755656365949,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.7323,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.3996307000914964,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.7496,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.40822290274043077,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.7065,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.45131242199847604,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.768,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.45472522741851235,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.7355,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4284582880321103,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.8072,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4077950430987271,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.7883,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.443123826971581,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.7785,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.42191143033368195,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.8033,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.544556846459226,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.837,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3897980689268285,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.7923,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4340545855141479,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7744,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.49582655199280046,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.7324,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4238385179654696,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.7767,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5308351549134397,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.8437,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4110401224429797,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.7636,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4873911227349872,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.8241,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.42240756526435774,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.7083,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.42036965303141166,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.8498,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3878007767118561,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.7921,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5086889199469316,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.8485,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.38477687023795526,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.7613,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.46085964244404704,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.7689,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4071525230768036,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.8064,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.6214994476872179,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.7933,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.48118329615371713,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.7912,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.39560748056930567,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.755,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3600598201768951,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.7395,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4652092977802121,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7723,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.48845591366553076,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.7768,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4114611715018722,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.8183,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.37871588744645535,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.8128,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3919736583377089,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.7329,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.4469471117249365,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.834,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.468934964665669,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.7651,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.48844250247818427,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.7297,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4524364309414198,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.8363,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.45213211484185767,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.763,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.46723958862108017,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.7722,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.43863412802839247,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.7547,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3953813072931627,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.795,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4415874074419842,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.7514,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.42128134880801454,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.7878,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3788662324527675,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.7441,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4286441840661642,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.7478,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.45115458519962515,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.818,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4211270204030477,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.811,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.3847476912513861,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.8067,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.41440864739044353,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.805,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4303158521887713,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.8265,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.38697710842550026,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.7795,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4391608267010154,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.8082,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5265387648782383,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7479,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.41646004486406024,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7377,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4877047827862541,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7319,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.45270307907667634,
+      "learning_rate": 0.0,
+      "loss": 0.7682,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 110004521598976.0,
+      "train_loss": 0.8352149829864502,
+      "train_runtime": 1951.6222,
+      "train_samples_per_second": 1.025,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 110004521598976.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f7b4f2b72c113b4dbf491f048e0cdfca81e6b93d
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "down_proj",
+    "q_proj",
+    "o_proj",
+    "gate_proj",
+    "k_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..13bf1da3339716dade0983b19e397dd49390d28c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2aafd6948ca148902b2730fe8e1d95c1571e4c2c55ae20b09385ef8b1051908
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..40f2a2fca64ed7d3c4d0ec15bf4004c483212968
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84dc84f62ee656a692e7ad364457c586b7cb19189958b0beefc8b064215fa3eb
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..09912d963c5c8b138e517f69057dcd75e5239775
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,476 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9925947841990691,
+      "learning_rate": 0.0001,
+      "loss": 1.4992,
+      "step": 1
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.9063043848088819,
+      "learning_rate": 0.0002,
+      "loss": 1.413,
+      "step": 2
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.0488931165337085,
+      "learning_rate": 0.0001998629534754574,
+      "loss": 1.2539,
+      "step": 3
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.9333388718058,
+      "learning_rate": 0.00019945218953682734,
+      "loss": 1.116,
+      "step": 4
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7972030303434287,
+      "learning_rate": 0.00019876883405951377,
+      "loss": 1.0013,
+      "step": 5
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.48274056578620395,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 0.9533,
+      "step": 6
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5434712430562691,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.9351,
+      "step": 7
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.41458095983795856,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.9267,
+      "step": 8
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.38026363031320687,
+      "learning_rate": 0.00019335804264972018,
+      "loss": 0.8753,
+      "step": 9
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3959045312773757,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.8596,
+      "step": 10
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4235300919941064,
+      "learning_rate": 0.0001891006524188368,
+      "loss": 0.8921,
+      "step": 11
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.38628932587744996,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8455,
+      "step": 12
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.44369543041418,
+      "learning_rate": 0.00018386705679454242,
+      "loss": 0.8781,
+      "step": 13
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4109843283491267,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.8934,
+      "step": 14
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3749483097846325,
+      "learning_rate": 0.0001777145961456971,
+      "loss": 0.8951,
+      "step": 15
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.37971805656126606,
+      "learning_rate": 0.00017431448254773944,
+      "loss": 0.8409,
+      "step": 16
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3754156243492605,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.843,
+      "step": 17
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.38235046938512196,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 0.8271,
+      "step": 18
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.35465995333344363,
+      "learning_rate": 0.00016293203910498376,
+      "loss": 0.8661,
+      "step": 19
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.32753934791861467,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.7687,
+      "step": 20
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.33113997504958703,
+      "learning_rate": 0.00015446390350150273,
+      "loss": 0.8097,
+      "step": 21
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4136216149035267,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.91,
+      "step": 22
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.35190712822057785,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.8756,
+      "step": 23
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.34968365505535903,
+      "learning_rate": 0.00014067366430758004,
+      "loss": 0.7906,
+      "step": 24
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.39304907579519915,
+      "learning_rate": 0.00013583679495453,
+      "loss": 0.8758,
+      "step": 25
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.31310585822396536,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.7944,
+      "step": 26
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.355537005877262,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.8596,
+      "step": 27
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3756637193077202,
+      "learning_rate": 0.00012079116908177593,
+      "loss": 0.9009,
+      "step": 28
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.32107783868074957,
+      "learning_rate": 0.0001156434465040231,
+      "loss": 0.7966,
+      "step": 29
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3259594093549215,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 0.8079,
+      "step": 30
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3311002448589628,
+      "learning_rate": 0.0001052335956242944,
+      "loss": 0.8255,
+      "step": 31
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3052727717674524,
+      "learning_rate": 0.0001,
+      "loss": 0.7639,
+      "step": 32
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3337694649290884,
+      "learning_rate": 9.476640437570562e-05,
+      "loss": 0.7875,
+      "step": 33
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3263404508721736,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.8333,
+      "step": 34
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3029279849900034,
+      "learning_rate": 8.435655349597689e-05,
+      "loss": 0.7776,
+      "step": 35
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.31611457076476746,
+      "learning_rate": 7.920883091822408e-05,
+      "loss": 0.7446,
+      "step": 36
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.323006284197396,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.7605,
+      "step": 37
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3148016363568966,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.8099,
+      "step": 38
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3473242390582786,
+      "learning_rate": 6.416320504546997e-05,
+      "loss": 0.8058,
+      "step": 39
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3550223861193946,
+      "learning_rate": 5.9326335692419995e-05,
+      "loss": 0.8288,
+      "step": 40
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.2994454014403082,
+      "learning_rate": 5.4600950026045326e-05,
+      "loss": 0.7598,
+      "step": 41
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.33866595315124687,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.824,
+      "step": 42
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3259479229606084,
+      "learning_rate": 4.5536096498497295e-05,
+      "loss": 0.8082,
+      "step": 43
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3150729309963534,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.792,
+      "step": 44
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.32722828375141205,
+      "learning_rate": 3.7067960895016275e-05,
+      "loss": 0.8406,
+      "step": 45
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3159616589658377,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 0.7793,
+      "step": 46
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.32912979548147964,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.8127,
+      "step": 47
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.28949891628996977,
+      "learning_rate": 2.5685517452260567e-05,
+      "loss": 0.7895,
+      "step": 48
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3166894220074553,
+      "learning_rate": 2.2285403854302912e-05,
+      "loss": 0.772,
+      "step": 49
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.33068916261585946,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.8098,
+      "step": 50
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.29806269029528093,
+      "learning_rate": 1.6132943205457606e-05,
+      "loss": 0.7874,
+      "step": 51
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.34900463428322714,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.8179,
+      "step": 52
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3414454707988493,
+      "learning_rate": 1.0899347581163221e-05,
+      "loss": 0.7967,
+      "step": 53
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.33107800165050943,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 0.7805,
+      "step": 54
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3141353833466967,
+      "learning_rate": 6.6419573502798374e-06,
+      "loss": 0.7911,
+      "step": 55
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3116391507758903,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.7838,
+      "step": 56
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3166462645242051,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.762,
+      "step": 57
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.32702334706441605,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 0.8318,
+      "step": 58
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3110840740582141,
+      "learning_rate": 1.231165940486234e-06,
+      "loss": 0.8228,
+      "step": 59
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2997913312758848,
+      "learning_rate": 5.478104631726711e-07,
+      "loss": 0.8228,
+      "step": 60
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.33605354873969534,
+      "learning_rate": 1.3704652454261668e-07,
+      "loss": 0.7955,
+      "step": 61
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3031227746472349,
+      "learning_rate": 0.0,
+      "loss": 0.7533,
+      "step": 62
+    },
+    {
+      "epoch": 0.992,
+      "step": 62,
+      "total_flos": 157753171902464.0,
+      "train_loss": 0.8592852305981421,
+      "train_runtime": 1934.1804,
+      "train_samples_per_second": 1.034,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 62,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 157753171902464.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..854add7ac5db7e3e908ed889d540ad1779eff2a4
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e6b6d6636eae72163b2f1ef83595a0e963fad996
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6988dc6825c7f45d2b7f3c9c8570387c493c88e64abe1ddf9f219e63b235e85c
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f98d2916fcb473e77ae7f374fe59b7ef47db98d7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65b1c8ec25756d095485d794acaf39c499fd58ee505eeeeabcd5bae8d1614f01
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..45739e4f509123aa2a4bd28da39bdcd56e091204
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.9048587630560018,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.3101,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.0519343911287062,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.5194,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.0224268066373816,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4268,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 0.8656445885930367,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.3686,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.8733966115001257,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.39,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7783883541710935,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2797,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.8561524970209158,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.3017,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 1.0927395440249963,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.321,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8281037473311745,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.2032,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.807714070924002,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.0587,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 1.0483239118173178,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 1.0284,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8507389995888006,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.0391,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.8208532938213147,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 0.9867,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.8334380171945804,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 0.9704,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7429403910377161,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.9036,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.7967026196736772,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 1.0404,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.6848930255684692,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 0.9056,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6346302299463533,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.8942,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.6535131512797249,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.8365,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.6509491421625535,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.9277,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.4955595210807453,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.8423,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.5361172069164036,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.9436,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.5973579229429707,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.8421,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.5849298010283289,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.8671,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.5759168499300029,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 0.8499,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.515738534877068,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.9084,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.6110132431887146,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9516,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.5725801143409102,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.8457,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.5012871976844285,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.8426,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5340718478626321,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8921,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.6221479080631429,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.8718,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.47340491615336217,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.8138,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5906496115820806,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8141,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.5911771567770389,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 0.8854,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.8272434744116125,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.9541,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5348620169937606,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8495,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.4862616162975578,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.837,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.559516838035242,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8702,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5831454319541884,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8287,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.527006776442556,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.7676,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.5946342001260494,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.8255,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5359987813729276,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8617,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.532072440256019,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.8779,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.5430218364120416,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.8794,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.4524285119863887,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.7943,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.4634163070353862,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.7873,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.49667886103870035,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 0.7983,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5049931634295179,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.7981,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.4568261725717611,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.8456,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 1.045715299468113,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.8974,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.48309628455544334,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8137,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.5679687118692228,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 0.8756,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.520824672017796,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.8069,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5430786799958935,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8539,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.5525254109241546,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.8129,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.5377953868559492,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.8489,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.40336792195406596,
+      "learning_rate": 0.0002,
+      "loss": 0.7662,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.5044050444585633,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.8726,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.517946310570053,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.8351,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.45762285553642407,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.7832,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.49520896027565536,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.7762,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.4841734921934868,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.7769,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5308062158451023,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9068,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.491633130500264,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.8502,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.44318703361045014,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.8117,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.39215294250553073,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7808,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.4675337591215834,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.8424,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.5095405074681937,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.7829,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.48681538869656305,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8225,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.4204760302318927,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.7648,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.5229961269669894,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.8768,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5443752475655127,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.7805,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.5127777436976269,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.8312,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.4867549804722123,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.8295,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.48800025181975715,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9258,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.49668936103675393,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.8858,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.46659150620401435,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8052,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4743344063668828,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8172,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.4869439291640609,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.7756,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.45117175186769337,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.841,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5395601612912219,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8084,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.44014531925221906,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.7761,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.45721365845663203,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 0.8421,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.43168167968243937,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8152,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.48439026237325494,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.8061,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.4314723559695366,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.791,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.45774727472954585,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.7741,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.5013043190046269,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.8525,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.4160355669759571,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.8043,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5359859927158864,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8082,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.4991768151741999,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.7806,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.4480616596528081,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.7346,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5734125655859981,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.807,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.4390162171856093,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.7387,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.47565640795131414,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.8039,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5507870634162346,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8111,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.537742020914287,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 0.8338,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.5145928182957982,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.7749,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4570558282445582,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.838,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.4799901830500369,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.786,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.5251908959822083,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.7709,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5531788343340106,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8997,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.4966213467509509,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.7913,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.4745035073937513,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.7176,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.5021042926736523,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7945,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.47409043153267333,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.7728,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.47479390037974667,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.8747,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.465138144925051,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8798,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.49635684069772074,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.8658,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.4346501860458164,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.7793,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.45969112593289096,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.821,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.44301514811842374,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.7998,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.5218249227729164,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 0.8215,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4531048118939405,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.789,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.48172721813636576,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.8278,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.5548702921280615,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.8377,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4782174625773169,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8071,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.48008735393392793,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.7957,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.4846278088312286,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.8309,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.43094627314402284,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7827,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.49785114222657906,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.9121,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.4529648152248877,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.8266,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.44916565811300524,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8329,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.4058584416064084,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 0.7247,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.46997145543886404,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.8424,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4293882737620628,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.842,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.5413296014362735,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.8283,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.5018749189961162,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.7647,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 1.1300842004771448,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8378,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.4669099932081055,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.7556,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.5034657684944928,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.8188,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6057387263076913,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8791,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.4239059530916577,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.6973,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.7760985161052981,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.8118,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.477661473194432,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7303,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.4897224336939851,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.7916,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.4943394136702606,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.8159,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.47864484082798453,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7898,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.44712867207442375,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.8483,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.5026585442490596,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.8696,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.44196458116495235,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7583,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.4236005733025835,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.736,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.46361836681455065,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.7891,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5408348194034053,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7486,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.47465928218004905,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.83,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.4484263671568565,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.7349,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.43818164281104116,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.7336,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.4813353284056389,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.828,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.511981521156402,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.8613,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5647154584679596,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.7679,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.44305957564766546,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.7845,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.4562664206946426,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.794,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4543083736456288,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7812,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.460607580393084,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.7517,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.4805994336463568,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.8167,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5223439324811999,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.819,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.4707824952310109,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.8005,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.4542483658151442,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.7088,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.4757409168949553,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7295,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.5502377382211688,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.8607,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.4735340368833073,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.7139,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4148398649639404,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7629,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.4821367727124406,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.8125,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.44594815157373763,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.7765,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4730271181666054,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8532,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.421433524862573,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.7891,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.4816306325674117,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.8018,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5062506020880575,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.799,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.4890455686153939,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 0.8148,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.508208932915509,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.847,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4751378476016209,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8417,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.4308596783807121,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.7949,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.44378724952748994,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.7418,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.47251508899567907,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.776,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.4719056631507855,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.8101,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.5154067580332798,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.7865,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4774614610268319,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7611,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.6279916872210253,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 0.7965,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.5368628256054744,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8515,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5116000985794409,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8993,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.45360288544505306,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.7928,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.43260462670796207,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.7726,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4708179747920768,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8254,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.5668599401104251,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.845,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.46800930216492326,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.7767,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5142748820218108,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7425,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.4495781555254168,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.7721,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.4398194024113899,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.7554,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.46769385969747856,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7729,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.4499968242364586,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.835,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.4943486759509741,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.8232,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5371693975622313,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7985,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.4479662076166779,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.7876,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.4774784162682743,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.7794,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.47837892223465484,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.7131,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.5047249526760721,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.7822,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.49697405282551316,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.7668,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.486842951541938,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8311,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.5117164630412789,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.8394,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.4430229505434961,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.8238,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.49889594388044983,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8521,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.4265568951015607,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.7592,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.4719272197684277,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.7673,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4625187503128492,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.6884,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.5439453074539425,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.7456,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.5091163294240805,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.7669,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.48946230838595484,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.7849,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.4553021564198512,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.7738,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.4795543452934293,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.7983,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.454044266104084,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7982,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.4347171236331447,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8419,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.4052379979410376,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.7631,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.43569229168908397,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8285,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.4398135539279044,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.7413,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.4954383627255014,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.8204,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5196432413224092,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.813,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.4998146296917399,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.7468,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.44380017905542196,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.759,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.4735241465117807,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7539,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.42614767575223617,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.7549,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.4912794884557202,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.8212,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.40804040740465725,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7729,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.5057435393007068,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.8356,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.4450044987136846,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.722,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.40539864012144133,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7341,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.42945105211040513,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.7402,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.4400399253283012,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.766,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.47480775379120294,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7735,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.48083624247350093,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.7437,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.4527960167439757,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.7084,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.46587383560167145,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.781,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.4359575852677296,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.7918,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.4285546310251104,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.8369,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5248001655775915,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8085,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.4036128329793708,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.7202,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.4388416392979749,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.7236,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.4057369764587263,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7492,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.43058107831103193,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.6945,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.4740706565815538,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.6996,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.48517105441975683,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7643,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.4475993762689496,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.8288,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.4660880690247606,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.81,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.506717111232454,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8701,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.4676798932942207,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.7892,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.39802011134696863,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.7463,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4025814525496451,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7596,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.4293380673152308,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.7349,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.4484093869125715,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.7896,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.5745472609444757,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7758,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.4511690947612965,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.785,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.4301389133636024,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.7118,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5628558109665334,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8757,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.44825195379467003,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.7549,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.4139688294442162,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.7216,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4914999085759857,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7857,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.45045861846658547,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 0.7594,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.54415976239134,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.8186,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.44770674423486395,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7563,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.4332439167961714,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.7765,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.513080777310441,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.7722,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.46048250333472324,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8457,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.5409763038472102,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.7467,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.5048634432898285,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.8176,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.40677339313445815,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7599,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.5137234723336886,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.8256,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.4955039646766089,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 0.7996,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4549543044910791,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7659,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.514233378986321,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.8473,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.4197860898310825,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.7577,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4655606876532891,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7232,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.5219658661940341,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 0.7775,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.45920050910419263,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.7591,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4949167375260932,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7895,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.36550310613011944,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.7346,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.40501845811731474,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 0.7659,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4372750459349511,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8472,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.4554561174672684,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.7362,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.49697228865217935,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.7209,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.4043197887249852,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7523,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.48191173115223146,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.7505,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.47714079201647,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.6959,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.47070677791037496,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7352,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.45554044653552067,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.7379,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.46584105355548255,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.7564,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4314749264263858,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7059,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.4695735068859168,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.7385,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.5126796493845308,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.7944,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.44631848104777794,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8395,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.41647207251345697,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.7578,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.43351509197473287,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.7547,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.4360669876087587,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7135,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.47969797535202524,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.7334,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.5321303633183078,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.8588,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4226381310659623,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7046,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.4321522307672889,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.74,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.6109275128340287,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.8307,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.561826732824592,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8377,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.46881849298116435,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.6666,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.37386184177399606,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.7083,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4873371584300598,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7806,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.49176854968781913,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 0.7992,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.4245751108118662,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.7795,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4913231560990115,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8206,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.404512219030091,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.7035,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.48913247597842824,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.7747,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.43271391067608156,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.7488,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.4969527465185882,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.762,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.42931030111391627,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.7261,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.5069524485557306,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7948,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.4465579286557172,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.7229,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.40904039416327287,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.7192,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4808812498972212,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7615,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.49575520872474643,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.7769,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.4463337518123487,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.7544,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.44638970362973823,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7316,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.43747382893287573,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.7618,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.47841798593223983,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.8292,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.43636901681178347,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7256,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.5180081581480641,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7742,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.5253220855220916,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.7741,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4256667849850581,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.6888,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.4278391289850968,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.7634,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.4436933202303663,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.7468,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.43321787428379244,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8549,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.4377508413236027,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.74,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.3713006997854288,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.6814,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4443506208855899,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7231,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.43526163578696675,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.7713,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.38246761070996405,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.7667,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4314246271817471,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7029,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.39923494746519433,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.7813,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.4186847398364211,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.7105,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.47557197889034186,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7047,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.4132150885926361,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.716,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.4372379020786047,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.7624,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4344170860589622,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7308,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.4963163084048665,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.7996,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.4319581176838081,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.7587,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.40989408741441974,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.6997,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.41529156262352523,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.7342,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.43645340390430715,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.7523,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5428773112788919,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8411,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.4257541414345063,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.6975,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.4826240533571775,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.756,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.48733773081361126,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7816,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.4532479494685654,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.778,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.4262543331458245,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.7006,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4563004023727352,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7583,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.4625188794998478,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.8411,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.5536645615515583,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.7795,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.43196130501488406,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8198,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.42642820835218254,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.7555,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.4647214261040618,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.7525,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5125122683502902,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8201,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.4330139678839322,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.7307,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.4824467437038003,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.8324,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.533861956777296,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8308,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.4060688080818584,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.8042,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.4216719398946027,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.7504,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4027708900568809,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7287,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.444151307821528,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.7606,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.471332668323808,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.782,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.3791440308583429,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.72,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.49731759646196794,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.7959,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.46620678123962395,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.7923,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.41068869026604143,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7252,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.3776789165535236,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.7547,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.5736993325001151,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.8784,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.41020244284924917,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7154,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.5054142932121306,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.7587,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.39883694234628286,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.7256,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.43049400761772333,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7944,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.4720332451741232,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.8115,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.5270018673238716,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.7891,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.47994775911430304,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.771,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.4445426182391953,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.7426,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.4810788695239798,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.7136,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.468624310902255,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7427,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.4051085347747549,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.7665,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.4429745586090362,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.7632,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.4243678167327291,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7301,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.4802588045036027,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.7024,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.44080613228848897,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.6883,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.6211221715459578,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7807,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.42586430306449,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.75,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.40107312109244403,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.695,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.442626202174717,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7851,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.4184883405956326,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.7674,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.366713571829327,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.689,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4414073292564949,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7457,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.3942549677792474,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.6964,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.41503269097137613,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.7135,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5245371258298765,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7034,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.4594749181909736,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.7498,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.424235144677964,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.7235,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.44602704702768614,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7556,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.5232426706389673,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.8369,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.41560340485086095,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.711,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.3944540115776522,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.6959,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.4491732511440907,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.7476,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.3921587418990816,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.7307,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.486831333616804,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7517,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.4010907138133327,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.6992,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.411977049690829,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.7785,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4241612115568198,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7824,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.4386435931679396,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.7548,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.5516298765589959,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.7903,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.5237223995446475,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.6774,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.408081916675791,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.8071,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.4140323106783898,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.7333,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.4109352116539475,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7517,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.45203015421066733,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.75,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.47986137265508316,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.8174,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.47361437236813453,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8038,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.38095316941155105,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.7278,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.42604931965231435,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.7002,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.548002801499899,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7803,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.5019579540403647,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.8137,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.46495451500772306,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.7639,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.42686783185227567,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.764,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.4779166686139942,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.7343,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.4977038804757904,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.7508,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.4172342695672868,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7518,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.4395016138892947,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.7764,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.45266053765333597,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.799,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.481917880967969,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7135,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.44519020465293857,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.8084,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.5670567897416601,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.7555,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4362383554539751,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7766,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.46855866315783895,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.7916,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.3999572615788725,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.7275,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4201746754962815,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.6857,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.4306577908340154,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.8254,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.454467704188289,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.6947,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.43617015792846464,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.6798,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.46513566341592816,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.7325,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.5209007191479196,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.7075,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.46186549657234066,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7843,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.4225623013491972,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.7213,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.5545250438765869,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.7348,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.46159909414157596,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7769,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.4608690961702417,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7723,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.4836076943876157,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.7675,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4822012598687289,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7657,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.39066832115403344,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.7076,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.43554059934563555,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.7195,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.5721257573663748,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7613,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.4723085206785278,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.7581,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.4497789103319603,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.7162,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4089031649487846,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7948,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.48799499911480854,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.7798,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.4518393471897176,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.7332,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4527696046131407,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7543,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.5013859458368709,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.7268,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.46679378591767084,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.7851,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5336235970260841,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7894,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.4883559010169363,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.7159,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.4820215129604109,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.7079,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.39368616328155626,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7186,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.4364515183793448,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.7017,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.43274092085542504,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.7559,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.48647218982442203,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7824,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.41861598085108487,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.7443,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.4062758300109946,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.7695,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.43903042467251013,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7342,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.43820479738922974,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.7055,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.4307447215835221,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.7517,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4839227716076366,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7339,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.41984870190651,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.7486,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.4801629623764257,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.7392,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.48309462607886405,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7754,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.46557741123122626,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.733,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.45685923269477885,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.7812,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4430033811591412,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7801,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.44962046020192703,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.769,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.47108421202389944,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.8501,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.41875727406366,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7784,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.42190855514124825,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.7209,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.38505377328740736,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7375,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4317908545722597,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7465,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.4275197743103051,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 0.717,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.42162843207096956,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.7574,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.42833408855858984,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7336,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.45819142099768806,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.7822,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.3873481884796177,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.7176,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.40982893189153785,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7623,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.44379926116672774,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.7311,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.40282092066670894,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.7126,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.47431162991111714,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.6946,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.376782750625978,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.7209,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.47210312905548446,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.707,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.46458293227031405,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7672,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.4724367145422656,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.6756,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.4654749494501859,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.8111,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3983425392960808,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7044,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.4365895782245567,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.7704,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.3947808943169786,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.7059,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4128950602547397,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7443,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.44306591083777735,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.7221,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.4007250119444036,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.6837,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.39105298298237556,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7669,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.41542512964154166,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.7377,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.44365767746746254,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.7283,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4359421690092708,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.731,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.3659002525973654,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.6292,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.38055418483876924,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.6775,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.4944794673497881,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7685,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.4472600645121646,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.7023,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.42356889474137294,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 0.6953,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4167344884222982,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7547,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.44603009027545537,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.7221,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.3768434984379742,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.6761,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.47799008750919264,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7552,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.3875051521127407,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.6764,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.41168483856721627,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.7037,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.45857767924392456,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.6801,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.45600457677670214,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.7448,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.40262795749401514,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.7197,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4756476115326436,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7534,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.40655573524205474,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.7545,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.5962571163332399,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.882,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4236164890146461,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7514,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.4191905394956668,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.6694,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.40519689301537903,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.7249,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.3992172252324664,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7125,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.45261090349463706,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.7734,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.5631326151894799,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.7948,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4004930942854674,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.6992,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.43682380042413677,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.7585,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.46233555406996935,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.6966,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4054285615706168,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7185,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.3673169325312881,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.6838,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.5248117352974394,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.7785,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.39602694225476154,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.6082,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.5133725626357958,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.7176,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.44499807570944844,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.7449,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4630238503557464,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7421,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.4854533678419448,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.7793,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.44544379557468916,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.7266,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.40447421537567746,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7453,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.5173650514310973,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.6999,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.39231299253018714,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.6665,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.4628975315178123,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7475,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.45529339357311305,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.742,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.42257598124521273,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.6837,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4717488331588699,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.6761,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.406874476439242,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.7217,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.44644268439113843,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.698,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.4885011165806191,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7378,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.46157866114898766,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.8314,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.4806213326080623,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.7766,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4313784252569985,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7742,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.4186941403333472,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.7221,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.4153597720533655,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.7135,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4301708135822861,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7219,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.38497717673667126,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.7413,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.4214776209400845,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.7778,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4485603257497472,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7231,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.4467013277600801,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.6717,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.7248352418893502,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.8396,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.5316655928965951,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7603,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.4191163861171123,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.7147,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.3901779280880945,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.6702,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.42692082886530397,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7633,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.4657478707478943,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.7392,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.43968295433399096,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.7536,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4879853331438654,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.8114,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.524883168299829,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.7918,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.4090914095307245,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.7193,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.43022883747007334,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.5877,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.4273854651825723,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.7073,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.49306924109483663,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.7834,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.44958407187654065,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7516,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.41517495755784234,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.6902,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.3853060345524645,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.7572,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.40441247254812485,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7149,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.5088494997903442,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.7483,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.5522199125021355,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.846,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.37330826151446855,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7074,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.48956139290248746,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.6854,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.41750812621008515,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.7007,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5056192033810395,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7471,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.403218209604543,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.7342,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.439841734412921,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.7172,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.513975233435866,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7734,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.5124133462407117,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.7933,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.45997147328786614,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.7437,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.47849333008086586,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.8166,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.44707270851351555,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.7327,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.37707704242799883,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.6844,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.5556467813011955,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7432,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.4491670034732749,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.7496,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.45034954515495185,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.7181,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4245467770127856,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7253,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.39504189902573533,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.7526,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.4750528432017365,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.8005,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.46842200890521113,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7005,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.35903952707599596,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.6735,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.37995752480766354,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.6625,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4476073132273197,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7285,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.43193310514746797,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.7437,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.6244911316558347,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.7943,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.42134930350758226,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7531,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.4813548073126474,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7492,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.45462078206354734,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.7924,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.43853925193295296,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7238,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.4369785313615201,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.7512,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.4294524565445153,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.7708,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.4119283273720428,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.759,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.4239654817744758,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.6857,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.41958723907270873,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.7528,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.42760597413040674,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7488,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.41189489453568245,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.71,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.4325046558582601,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.6984,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.4689755340313718,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7053,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.5179610692091388,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.743,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.4371161166067628,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.6869,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.412067856511975,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7089,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.48352704083048004,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.7538,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.4114561121520049,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.7283,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4129768740986683,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7181,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.4058308521722823,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.7241,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.5313828099613424,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.7964,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.44339994785709935,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7237,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.36296656767708446,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.6328,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.4200778471709336,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.662,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.3714091612897101,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.6658,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.40298315807868135,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.6665,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.4312775541358898,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.7136,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.434751665612994,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.6922,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.4304044437917915,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.766,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.435059891602515,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.684,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.4716447319029521,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7709,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.46798986066853526,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.6937,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.398112591291493,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.6595,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4144261054191747,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.6714,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.40097132071333075,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.7164,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.4072626688281069,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.7137,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.402793601796227,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.669,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.434833795749613,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.6841,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.4052645108469939,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.7611,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3805800622358866,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7112,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.41487367876380865,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.7288,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.4646233771044,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.6995,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.3963892013269609,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.6791,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.4184099001892143,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.6969,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.42542898585695293,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.7191,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5448829291630773,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7122,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.47125879567517487,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.7616,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.44026883256482013,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.7366,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.44467255998295074,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7329,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.47803310845693225,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.7534,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.43203894941566023,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.656,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.43900267263581866,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7244,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.454298973382149,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.7562,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.3802978945649559,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.7344,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.3945099100516662,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7441,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.4475131716099979,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.7483,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.4642469578161393,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.6875,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5176302593343155,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.8067,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.41986391465678924,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.7475,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.46136378205709,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.6895,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.5259786387214987,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.784,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.43018111896826905,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.7328,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.44617404140289296,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.7792,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.37068315607963076,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.6809,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.4268403834428016,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.7044,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.49275362102142706,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.7278,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4119689333350128,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.6608,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.48239976045422317,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.7022,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.444957256161989,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.6812,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.41397582982901027,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7159,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.41256809240312126,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.6932,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.45527592581249554,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.7266,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.44174574312687004,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7564,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.45677830394976854,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.7419,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.39187838181442464,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.7346,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.5075723274296824,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.8275,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.4658019119180805,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.6864,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.38307767363207407,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.7388,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.43750010476245005,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7457,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.43833249022891085,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.7074,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.4314878527261997,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.7794,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.48154869168406245,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7915,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.4327408829170178,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.7173,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.47887799882065063,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.8572,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.45107242319571567,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7086,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.4629615233997194,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.6792,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.4239680655581829,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.7308,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.43004699919410794,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7725,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.4434971876446726,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.7257,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.43702022329417833,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.7078,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.5165487230674608,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.753,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.43730629587059877,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.7266,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.4353705569266859,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.6965,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.42291324813123504,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7302,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.43608599164904627,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.6879,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.44640764356716217,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.6458,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.48226250089051803,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.6977,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.49380259764442747,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.7087,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.436743758192173,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.6975,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4334747817473767,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7104,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.43374268546338396,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.7169,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.3925376896201814,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.6734,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.39449819892584087,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.6673,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.4509103518457081,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.6778,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.5082219387763629,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.7513,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.40216967606345727,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7431,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.40969266203255283,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.7277,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.41888905431993123,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.6775,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.4093731187807328,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.739,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.4178473476692447,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.5942,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.44171104100805797,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.739,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4252349083840047,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6972,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.377200795606724,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.6597,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.4818691640691401,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.7594,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.46462827650401956,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7223,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.45987811453490407,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.7314,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.4369841928085185,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.6808,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.38233064870972155,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.683,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.46603230782706917,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.7672,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.41753317232563836,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.7436,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.3799296803428843,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7286,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.47292296820852897,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.7185,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.3895957480245421,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.7048,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4066293338823343,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7521,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.4387329715532074,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.7655,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.3931358838546744,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.6756,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.45033280252042396,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.662,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.4523685749264847,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.7158,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.38155286027533103,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.6829,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.40167710016071684,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7151,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.4597203398268425,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.6922,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.42083485098131285,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.6411,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.3778948458076018,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.6016,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.4087111865952575,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.7388,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.4049422496340538,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.6783,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3992656661882664,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7054,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.36378149445487457,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.6654,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.4882319040471909,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.7128,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.4269943520410961,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7005,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.36804409928283105,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.6559,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.4195207951945363,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7012,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.44710522148863835,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.6932,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.4204747676370217,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.7129,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.42163735336974956,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.717,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.4757406531134025,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.6804,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.4315771333847926,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.6173,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.43237685072831544,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.6933,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4158118707747596,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7036,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.4307855486428975,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.7496,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.429025745211691,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.7247,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.3576468631216106,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6742,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.4966019064934107,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.6822,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.45456433226106024,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.7619,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.36235700010315813,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.697,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.40115755508883477,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.6888,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.39301021367659733,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.6737,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.37689418356760185,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.67,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.454763983645677,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.6682,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.4043439631314288,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.673,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.44132088073018694,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.6663,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.4664887460490766,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.6323,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.5079194071561257,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.708,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5136320484094405,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7072,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.5002124512143936,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.7624,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.41959518702885695,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.6944,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4161606124666796,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.6911,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.5116685264471817,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.7093,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.384814734377533,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.7271,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3820311467510209,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6721,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.43601952633834995,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.7368,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.4331297679878163,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.7048,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3776924995843434,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.637,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.3659212952711077,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.6467,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.4670658459953559,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.6784,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.46150008468924536,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.6896,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.3960956573102029,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.6774,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.4856950320911442,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.7489,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.41523717872542065,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.691,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.43311712891722615,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.7437,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.417547226064576,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.671,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4302231301735469,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7273,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.40642630783593625,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.798,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.5149084196264678,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.7864,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.401441193039279,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.6718,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.39953693235491955,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.6589,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.4883869855102983,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.7537,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.43684597712350054,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7148,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.37729354638193896,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.6764,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.4147481636668755,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.6456,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.47940626886163656,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7393,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.39817914406276367,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.6675,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.5584313468882833,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.7667,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4367114712993521,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7107,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.36940731022190015,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.7088,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.4887139556790538,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.7472,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.5148049545306919,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.75,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.39045927155655474,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.6184,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.41338525824422334,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.6686,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.4580040152660515,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7759,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.39197016661405115,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.6802,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.4060787668598037,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.6979,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.38749826680060884,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6528,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.44255595065983705,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.7141,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.37420480713147775,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.6887,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.4178451644372548,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7626,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.40716545195879217,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.7046,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.37260467690368004,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.6957,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.392410196841189,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7572,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.540225482513355,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.7078,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.4290983314062885,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.6923,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.45463577400665817,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6582,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.42506359020091006,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.7068,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.4052264833834573,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.6754,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.42067284781427855,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6647,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.3893801590118544,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.6829,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.40396908137241916,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.6611,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.41467895442675545,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6611,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.4092569145701477,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.6829,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.3735402941611452,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.6455,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.49159857340071456,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.752,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.46477841449404667,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.691,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.5256131810881728,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.7473,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3994308410818447,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.689,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.38590268814176476,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.7055,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.39839738685156506,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.6476,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4560486773170303,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7591,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.41140782608567117,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.6936,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.43694387286818914,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.7166,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.43371584027130167,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7039,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.42122043137786364,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.6919,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.43458846282659935,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.6906,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3964420694030289,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.6506,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.4522492130358685,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.6737,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.4414585602518746,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.8038,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.39697720943345677,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7218,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.4639089157109346,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.7024,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.4252895100417365,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.7765,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.428348675853124,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.6838,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.42427234741851616,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.6729,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.44690930235755943,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.6534,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.43840045498549246,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7117,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.44494519520031595,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.6616,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.37584479532435616,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.6595,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.416988876074108,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7148,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.448417536573754,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.6679,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.4112498483095486,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7651,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3811467925411851,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.6415,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.40735345211114443,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.7183,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.365712004883673,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.6531,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3399382117782691,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6058,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.4379389015780378,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.7172,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.4369172442435986,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.7079,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.45164358615309574,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7473,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.3659976138113478,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.722,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.3777699951256542,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.6876,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5690609248953112,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.8172,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.5600529431049095,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.7175,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.38616115760346603,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.6684,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.37757205127412863,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6449,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.4227552376754661,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.6942,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.37636659984609294,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.6686,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.42958155859538183,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6565,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.4676737883015861,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.6748,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.4298123565151547,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.689,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.38712485850856665,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6774,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.4538316685132117,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.7661,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.3902900523149195,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.7253,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4549690943672534,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7087,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.37214768787751934,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.6301,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.4197220669283357,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.6499,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4573007121363443,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.709,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.3750780611770427,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.6437,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.42598393273853435,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.6453,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4043634490015629,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7175,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.4100748462912414,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.7281,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.4083345582958824,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.6993,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.5386085667416355,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7671,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.5047367466998479,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.7901,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.4082332890498897,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.7327,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3957954699780059,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6939,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.4196723976406624,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.7432,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.39789366030622275,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.6599,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4066334113966666,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.6858,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.426114928047437,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.695,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.3657732022256318,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.6794,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.44820333186437933,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.6628,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.4191481015202587,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.7187,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.4211627256751037,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.7215,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.45212699420089375,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7439,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.3838447013917968,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.6623,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.4618605125531973,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.699,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.36043551834248494,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6604,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.4366509150795268,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.675,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.37824659917516085,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.6837,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.38607101421346063,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6727,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.4170767756378652,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.6756,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.4192517271303245,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.6877,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3883207568038762,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.6879,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.43154513623271423,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.6631,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.41906673225970287,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.6692,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.42328968585937776,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6827,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.4059000813630314,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.6998,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.4246838021330245,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.6727,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.41161990655189423,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7129,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.3830816169496887,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.6859,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.3474729385730149,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.6747,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.4336246830572765,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7288,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.4665382299598991,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.7506,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.368425197903905,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.6389,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.6781212965291331,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.835,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.4054103481988646,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.6339,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.4205111294204396,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.7192,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.39019261778083597,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6808,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.4834038492568575,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.7897,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.3999334184659683,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.6131,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3907108326637154,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7158,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.36783800739417705,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.6646,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.38766088148844946,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.6523,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4088791170143303,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6498,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.46091962482195553,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.7326,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.44779596810688416,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.6948,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4832443428848463,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7584,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.5520058904988282,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.7798,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.3983116907161026,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.6778,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.44715659040357375,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7322,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.3797034213767869,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.7212,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.4820125235481935,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.7215,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4606848863956204,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7216,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.35579528211098854,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.624,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.41696230084871505,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.7461,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.41051001450779595,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6704,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.4704820800628924,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.68,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.3643874726929824,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.6765,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.389512071010762,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6597,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.41278035073042324,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.6714,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.4436583315702858,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.7382,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4109632758758607,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6692,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.430103663623662,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.6749,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.4466133002880243,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.6864,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5412950980017701,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7182,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.4406866731902667,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.7105,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.40633350588685896,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.6471,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.42005885337753446,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6788,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.41525024156942714,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.6532,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.3599678570866186,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.6282,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4219733547454359,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.6445,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.3693070442514145,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.6684,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.40317218230251783,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.6847,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3667600449620981,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6463,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.4575459676412633,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.7436,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.4143052409106989,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.6948,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.5551592758193366,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.8077,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.36552580644611515,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.6584,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.44283388291390147,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.6412,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3989293668128008,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6487,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.3776659622179366,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.6177,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.4196106291201286,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.7107,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4356598894725207,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.6155,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.48373090615016834,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.6167,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.3961007416470711,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.6412,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.38454925146692004,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6678,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.4929824910516295,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.724,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.4124728750458972,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.644,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4081739368224594,
+      "learning_rate": 0.0001,
+      "loss": 0.6731,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.41193118141534857,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.6871,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.40752089452997164,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.6802,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.4527288918603455,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6869,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.40886209920259825,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.6658,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.3767957878760789,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.6953,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3913033172556406,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6215,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.3811126425119513,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.6931,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.4046478317760797,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.6852,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3855872010526326,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6597,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.433739458764419,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.6829,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.40624916566457947,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.7005,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.45049264698191427,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.7653,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.3537365765479179,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.6309,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.34679338832774076,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.5918,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.4130211561321415,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.671,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.39958354895664644,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.6542,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.3933026033310428,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.719,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4187629179794375,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.665,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.3512871037451522,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.5699,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.43032520290837295,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.7252,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.42372236008862924,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7716,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.4500511015150906,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.702,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.4369864297812008,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.7133,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.42738110216480996,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7108,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.4546254298926973,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.7021,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.47534555261597616,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.7792,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.35009175970476775,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6577,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.4146495775983063,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.6847,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.35274495325837385,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.626,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3946345924588137,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7274,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.40445474619161476,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.6666,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.4420847337796016,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.6106,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.5086383552008388,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7165,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.37050778943519386,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.6512,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.43013196569499396,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.7179,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4578202856490542,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7517,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.4495694920653057,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.7634,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.5851523217246388,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.735,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4621421543442429,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6984,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.4168786941520585,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.7452,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.40325108999711745,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.6807,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3727693370035713,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.651,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.44769379418292204,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.699,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.5111125142072811,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.6924,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.4251550782857545,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7157,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.3769181059561603,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.6644,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.45443832424297664,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.7169,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4511348245112982,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7126,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.4141737647376624,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.633,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.44494223441210634,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.6231,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.3946276691986154,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6472,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.4421635577070011,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.6987,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.4266538788112514,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.7181,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.39834575331728844,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6221,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.4717264032877369,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.7339,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.43136951491275893,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.6747,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.401431272925709,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7016,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.4865990810439014,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.6874,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.447148097712422,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.6572,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.3543191237955773,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6392,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.3984007462717862,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.6716,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.4730393750162832,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.6661,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.42607149707485553,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6913,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.4075380632043871,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.6374,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.4386141817128692,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.6773,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3528487154856506,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6702,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.37987693605543144,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.6798,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.434132619873488,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.6544,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4271382707165829,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7142,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.3605931608885955,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.6657,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.39329044283106057,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.6878,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.3995556394988917,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7098,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.4255688945635287,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.6487,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.43986765751893153,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.6796,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.39980228004220597,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6828,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.3816100710105836,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.6751,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.4180911252006521,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.6411,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.396115105457432,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6321,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.396360727759017,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.7039,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.37612404333459937,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.6406,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.3749842980949236,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6159,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.385694310353475,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.6532,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.3933784309864641,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.6225,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4003557685321842,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.703,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.44846692986705655,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.6939,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.351065732506569,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.6578,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3840655001604711,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6534,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.4057819927263224,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.6888,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.4539176876201716,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.6938,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.39433805090457724,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6768,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.36916749602491866,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.6298,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.5354961974864721,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.7371,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.3856750948532204,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.621,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.44416444073599765,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.6204,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.47608622183479576,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.7543,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.47805091710578973,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6586,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.5255071484114089,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.7729,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.4375427571144395,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.6628,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.4052738746215365,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7356,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.40261689141324786,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.6551,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.39752088590860307,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.6715,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3692470280595934,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6715,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.4286308729777144,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.6297,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.5670954690384703,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.7251,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.4267321456844729,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6814,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.38861456786538684,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.6271,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.38957989693957745,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.6861,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4398872258989492,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6615,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.3598987307930638,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.6296,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.44545085996388406,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.7323,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.4521438135371745,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6646,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.4133136448568273,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.7028,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.5112361936765261,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.7229,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3608855051180709,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6265,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.37581199774305873,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.641,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.36645220592842015,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6683,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.327679199766857,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6201,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.41461517959529814,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.6508,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.4596071811646076,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.6781,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.38161433704655506,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6177,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.41597134919896617,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.669,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.36479135825180353,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.6433,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.40702327606263927,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6458,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.34467219913279784,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.6305,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.4073720433870432,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.6297,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5012422499598385,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6812,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.4581412119281737,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.6534,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.3929723623011824,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.692,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3847513426944313,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.651,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.37002841059123026,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.6352,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.4035406759186194,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.6741,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.47409102297656225,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6994,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.3905904036526821,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.6307,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.40843841427179084,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.6517,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.5688501524807306,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7094,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.43581669103870135,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.7384,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.41014916555280057,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.6538,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3725949082075007,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6651,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.36963459874364757,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.6866,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.3849175928397143,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.6724,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.4581545019985593,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6591,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.41913596172880435,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.6591,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.4737884387010747,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.6145,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.370518586779289,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6183,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.40280941037817003,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.6555,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.3646693721554753,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.6597,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.3663775479204596,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6379,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.4488435817649355,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.6758,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.4469665095587836,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.6415,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.38464340027405997,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6508,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.45714227358367127,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.6618,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.38550320196660004,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.6569,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.4515411273502641,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6018,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.44006921412224026,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.6698,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.3812764903976601,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.6439,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4447058234378918,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6575,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.392188011838177,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.6589,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.38304190214514855,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.6556,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5405090465021215,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.712,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.3543253711580587,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.5963,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.5013105388465979,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.6463,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.45059783404888487,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6656,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.4330921252061515,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.6927,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.6987513883069537,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.6964,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.3816638781520124,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6299,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.3751370995542225,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.6594,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.4299281410382147,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.7386,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.34164488022808387,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6669,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.39787968534787277,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.6836,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.39106954459068427,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.6351,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3947541795328532,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6774,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.4213082829415851,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.642,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.3800845021173878,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.6721,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4071479381839004,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.631,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.40252912914646516,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.616,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.40052306331925214,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.5895,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.467091632151271,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7255,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.4630037079137472,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.6549,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.46378430219557376,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.7068,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4052886342275947,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6561,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.3999075024423376,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7029,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.5325865529164777,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.7503,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.5905952008527307,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.674,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.42646157498059994,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.626,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.4255902318426631,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.6575,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4297025470038566,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6237,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.40347903351671394,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.6588,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.3827082474076491,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.6632,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.37832045458505154,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6616,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.4111492464118394,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.7275,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.37288782144974,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.597,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.40002496138954113,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.69,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.36710877035634026,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.5799,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.39759880488318194,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.6172,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.4254248194110287,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6473,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.4216400213794304,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.6838,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.4060220364568944,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.6731,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.42450269698261156,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6783,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.4406882807056827,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.6606,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.38146261694436473,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.6835,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.3960931033382964,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6869,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.3920942872948423,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.65,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.502974696853216,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.8013,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3809516734862799,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.6536,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.37018149655501825,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.6641,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.4034959080754938,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.6341,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4134862147996729,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.65,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.3414312601353073,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.604,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.3959407841702227,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.6536,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.4133756851660163,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6885,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.37222803311698316,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.6272,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.369139299145551,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.5868,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4815835051436008,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7594,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.37513170722998457,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.6671,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.44751068701941626,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.6568,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3853514665076848,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6967,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.41773830193071393,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.7183,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.45915612060049255,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.6561,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.41944476778817447,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6754,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.35693564797134475,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.6096,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.3560566522051134,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.6437,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.5149668677272893,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6776,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.4248207701770222,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.6916,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.4732728561650195,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.7072,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3574158497559156,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6159,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.3482225731913979,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.5944,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.4001535338430282,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.6678,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4162071393998801,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.5732,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.4703542404400119,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.6813,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.5039572272619236,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.8289,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.39333888188660837,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6746,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.3737531333883778,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6262,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.3626593145896875,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.6566,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4379935971854093,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6222,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.37867647437644275,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.5945,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.3730232202108895,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.6218,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.36030794230806007,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6305,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.4183560884240375,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.6397,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.444780817450603,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.6384,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.45802748310803115,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7336,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.3647774377980463,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.6562,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.36960262362588187,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.6779,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.40527322757344364,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.63,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.3939893103860662,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.6928,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.3826699602637233,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.6269,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3501685236675613,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6623,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.43616076580584456,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.7103,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.3726378790672664,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.6285,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.41488579979296436,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6485,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.38689674009819797,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.6335,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.437200128890619,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.6312,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3919662080702626,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6619,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.3724718876940257,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.6356,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 1.7639155606560177,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.6326,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.40587213435186836,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6528,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.3922059812193428,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.6448,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.4343148971683679,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.7322,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.46419027714746003,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.744,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.4321260594814531,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.6241,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.5739554542647625,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.7038,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.43183095298941515,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6493,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.395184329350968,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.6287,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.3732252380750418,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.6378,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.39210240392021667,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.5881,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.3779633989621703,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.63,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.42191238495334316,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.6875,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4026515419401466,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6618,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.4192148150372642,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.6588,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.4149332064145483,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.646,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4034071104473392,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6219,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.4511181451729584,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.6585,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.47643267698354524,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.6782,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4086591914425235,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6718,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.4010432208685754,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.6339,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.4626664532030345,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.6472,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4308470746273194,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6923,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.42841013030392777,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.6747,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.41529556235870374,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.6941,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.45007324582289304,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6367,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.44953898222936955,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.6463,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.35092620368532895,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.6415,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4339567113204943,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7325,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.5796903423136838,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.6139,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5001667011104508,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.6418,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3820274792926042,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6333,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.3930062869164529,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.7064,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.45246692735783917,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6997,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.40434583944586,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6441,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.38217373263717996,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.6356,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.4063080671701538,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.6134,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4566062530039861,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7148,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.41542567815953363,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.6601,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.3802028666786385,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.6613,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.39668663463620063,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6248,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.38452156701095597,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.7134,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.3980848074267266,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.6716,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.3780272107106022,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6562,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.476362292617112,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.7302,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.4234756636372608,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.634,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.42949549729642317,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.68,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.3920835689592617,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.6153,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.4368257248483272,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.5868,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.4748109414344694,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7149,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.41796503413553826,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.6976,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.3895235977869893,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.6573,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5289606490625995,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6889,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.37743217428546827,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.6709,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.3947410873715308,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.7213,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.37167156176277555,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6571,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.47132134783591934,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.6725,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.44473248869883525,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.6765,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.4375571908622867,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7074,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.4399434207020205,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.6768,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.4906658379653056,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.7046,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.41867234663521896,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6557,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.41536161568550684,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.6557,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.36418655673146955,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.6456,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.38810328652348886,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6383,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.37927031129219313,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.6243,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.41696650741641733,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.6666,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.43900653427477354,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6663,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.387172857897453,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.5998,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.35978959618990397,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.6276,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3630333517324512,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.5525,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.3765369610533995,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.629,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.40095821539063065,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.6931,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.44801140332437855,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6566,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.4125511746187467,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.697,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.37231414111132,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.5672,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.42029924495620385,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.633,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.3883556397443518,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.6617,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.436223536266935,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.7111,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.4080282150670288,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6967,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.41073688554435384,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.6456,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.49566855251674646,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.7341,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.4047726805413657,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6305,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.37403704331724497,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.5819,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.3849853682834576,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.6712,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.43032730410963155,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6693,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.42595857625470585,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.6872,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.4261292946457587,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.6343,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.42577749406423065,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.654,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.37662528704168124,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.6579,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.412943020979185,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.5987,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4792157042628448,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6855,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.49828789883725827,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.6583,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.455543554624898,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.6929,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.37273936906484545,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.5976,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.4221561766618173,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.7151,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.4837192353409229,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.6432,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.44031896444546276,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6473,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.46081639364772325,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.7147,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.4450285751670438,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.5895,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.45999585667362314,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6837,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.5129723821905416,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.6623,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.4809087567118606,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.7028,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3743897890725053,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6569,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.42802344056660824,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.6528,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.35795356050626215,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.5789,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3545967010302099,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6488,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.4122734551525557,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.6833,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.4470107486644953,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.5883,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3843709654838672,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.5941,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.3976943559572271,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.6148,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.35580217621603544,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.6146,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4082499877029761,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6558,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.3724418218833736,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.6596,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.44126564209790003,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.6869,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4731138311593641,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6276,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.3805420640266841,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.6331,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.4195470170035172,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.677,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.4505011447716041,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.5932,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.39866514042034257,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.6426,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.3854196652956052,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.6336,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.45061179240724464,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7372,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.4727340269071424,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.6015,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.453650302483887,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.6064,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4625455283465757,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6724,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.4411413514045393,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.6293,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.4339975054849528,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.6984,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.4772981118073982,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6929,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.507646836621118,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.6822,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.4153766085596804,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.6188,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3866991784899309,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6159,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.5239299312000721,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.647,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.42454726330467013,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.6194,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.41768396922163836,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6889,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.39602687125476754,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.6154,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.3749979309905685,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.663,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3956799422689981,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6809,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.40275062142513046,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.625,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.40665924811794546,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.5881,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.3683174549367088,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6293,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.38720613592766406,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.6677,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.5036749622874261,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.6574,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4493115634851116,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6573,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.4535192687115761,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.5975,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.40060918718377536,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.6855,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3944709192374314,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.566,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.3726353302478949,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.6728,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.3949773732359607,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.6496,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3609421259884968,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6039,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.4445430731854319,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.6484,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.37093530483032444,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.5676,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.3667309750893148,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.5689,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.4066934128932922,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.6381,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.436885499719995,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.6583,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.40790686873105114,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6026,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.4355691333133161,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.5614,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.38422557716272554,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6302,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4461172465345776,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6151,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.47843635473682655,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.6479,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.42331070288726547,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.636,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.41940131364136446,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6257,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.36547908260345435,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.5842,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.47445564541342283,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.6197,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.3724715574385032,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6063,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.37114586593610205,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.6413,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.4376502146324676,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.6883,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4416327899525981,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6672,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.3946237440320865,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.651,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.40310508460486283,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.6521,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.39745506515246737,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6669,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.3829176817076023,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.6258,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.39879227345399526,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.5979,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.37884755432785133,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6177,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.3788256924171279,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.6381,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.4748820117521675,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.63,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.3607655066738298,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6382,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.4079651593929963,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.6357,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.3825655035108801,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.6376,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3617531326612794,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6141,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.4382082698441567,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.6647,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.3632824704187875,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.6035,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.3560365471363562,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6091,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.3838294817984326,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.6705,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.3733398564081259,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.6039,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.48904916348930316,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6889,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.4073197262263337,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.5791,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.3869296702934028,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.6186,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.426240402154218,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6561,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.36780682032114076,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.6031,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.4272964553458606,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.6594,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5311925819299563,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6507,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.42988680170294935,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.6737,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.3495168859717038,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.5725,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.3753366695725862,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6434,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.5231402347632671,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.6249,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.3955684288042401,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.6417,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.45152547314678537,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6455,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.4215951935807362,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.6326,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.43153923594719235,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.5353,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4259659351664963,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6094,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.4360547293724885,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.6261,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.4638142563286862,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.6728,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.39738276131172434,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6649,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.37987971995978564,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.5753,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.4125297244986955,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.604,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3981575505947327,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6008,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.3698169667168355,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.5976,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.42819852022057525,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.6912,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.4026300079112191,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.5829,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.46967039539446204,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.6183,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.4272125240673037,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.7072,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.5238563483964447,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.7119,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.39904585175727725,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.625,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.35931920466909617,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.6495,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.48450376861339645,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6576,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.4441082592529733,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.6697,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.4135004461788684,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.6178,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.4427996136818655,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6312,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.3784058040312355,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.6212,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.3865991684709847,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.6506,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.43057174229672973,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6286,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.36209032917868483,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.5779,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.46226915122683315,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.6852,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.40311867497576176,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6374,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.3896750404013572,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.6076,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.43408224275159285,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.7034,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4034619127151879,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6662,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.40323793654487333,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.6089,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.4443026691449622,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.6625,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.40824863764176456,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.5817,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.3985195292815999,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6344,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.3976172596684671,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.6812,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4536497404759498,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7157,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.37512175064821757,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.6133,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.45226940070095484,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.6507,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.4047218154179542,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6417,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.4043716939480558,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.6298,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.4811202338654842,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.7504,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.40514939933246713,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6279,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.39546233769838046,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.6299,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.41192552360644447,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.6328,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.386669832921117,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6478,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.46866776580758834,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.64,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.37375433442866596,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.5574,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.535446435298506,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6927,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.4270250115123898,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.6721,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.39523168119792357,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.667,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.43788555689573,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6069,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.3539516179999365,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.6349,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.3928005959926205,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.6294,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5619320827830976,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.625,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.41719664105653065,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.6822,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.3721896133840847,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.61,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.3982359588916754,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6404,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.37681429170405933,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6142,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.41225871232645167,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.6141,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.41223663323819554,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6203,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.41047135281712777,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.6554,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.4509436252773787,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.6593,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.39428136142012704,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.5994,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.4179779554309077,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.601,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.409533310574979,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.6129,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.41836000821068886,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.5881,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.3879860591336309,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6536,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.38425377493679347,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.6057,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3759245494228626,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.5599,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.39671834848468107,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.6314,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.48380012072574685,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.6187,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.40956690105938526,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6592,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.4448834724932321,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.649,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.43295392506847596,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.651,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.41656167554386153,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.3846976133129346,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.5965,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.35629754003871555,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.6437,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4081664586908809,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6338,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.451342848418995,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.6113,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.3597857961428527,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.611,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.4459748765452952,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6767,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.37846205035587793,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.5782,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.39859055595750215,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.6733,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4447503145214553,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6725,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.555091204170937,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.8254,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.3952512325592042,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.6195,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4041425848211964,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6251,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.4182819245877915,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.5874,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.42370316417755666,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.6149,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4324233810250658,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6546,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.48220088050045584,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.7157,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.4148371009594092,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.5829,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.3853866518903104,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6083,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.37623562198874927,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.612,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.37972097881100897,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.6373,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4554611346192583,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6807,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.40726502755025723,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.619,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.4200742311622154,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.6284,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.43408031966201255,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6363,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.3844388790122424,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.5677,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.4253501033138691,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.667,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.7020869956627367,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6088,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.39396732741035073,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.6442,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.3493693979867729,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.548,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.35117778593054183,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.5234,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.36619486228400655,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.6023,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.5099186718206056,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.731,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.38852863824832645,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6515,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.8558317095638946,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.5826,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.4568966509730956,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.6245,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3905492382856017,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.623,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.4484323000541753,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.6571,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.4143297902340561,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.6794,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5571960141540871,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6572,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.4434340121566493,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.601,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.4575437450532476,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.5841,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.4155530391409412,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.5829,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.4867803274492459,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6954,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.4007212557744502,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.6377,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.47953864644031985,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6398,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.36216117236352097,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.5955,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.4215038024581817,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.5442,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.3888286315492147,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6086,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.36291020187514095,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.669,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.4127933693889317,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.5936,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4612420205644517,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6399,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.37059553028725717,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.6532,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.37171843223128626,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.5681,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.38126739055664743,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6638,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.501560182270239,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.654,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.35005270978531305,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.6031,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.4579964934854775,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6843,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.4607942867233716,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.7195,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.4311888167516736,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.6781,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.39518314956332207,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6296,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.42462918863938204,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.6714,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.5099449458583174,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.6888,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4305167865608094,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6933,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.3906628278194183,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.6238,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.3973741892339613,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.6051,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.4251466738972076,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6643,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.43910898280815147,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.5949,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.34915883478379534,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.6349,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5213617644505394,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6377,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.3545430675105045,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.5441,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.40508741535303194,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.624,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3592324997219386,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6798,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.42576466795036494,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.6369,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.42515842466851367,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.6619,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3684056355727429,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6439,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.40750521721877575,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.6697,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.4212557915225309,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.6729,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.37503390518120405,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.5766,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.6496010140480745,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.5991,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.37552527204325087,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.6557,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3739385674179931,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.5871,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.5293550137074311,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.6839,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.3677175062078189,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.6414,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.548355690374935,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6905,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.46196465082695615,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.6747,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.41851828465891927,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.6216,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.40981786976518636,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6417,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.3839440413324502,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.6436,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.385038786223522,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.5943,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.4053712383349885,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6126,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.5242860113328237,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.6922,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.4126345251310983,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.6342,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3822975004295326,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6097,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.426063098512969,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.665,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.3834280671572431,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.6156,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.5218622425452143,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6951,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.4311073635345907,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.6073,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.41770082292991967,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.623,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4788822598582512,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6017,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.45318611123423785,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.5844,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.4666932424200912,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.6738,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.32966958448507333,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.5449,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.42879322785279267,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.6513,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.4321907403217351,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.5743,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4805601721813143,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6635,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.3804008808713323,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.5685,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.4520068873796711,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.6906,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3858318101878712,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.5936,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.41950896359501727,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.5963,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.3617807491751094,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.6464,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3338632658928289,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.5682,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.44834261317165985,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.6806,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.38605134729581864,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.639,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.474858212815285,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6611,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.36827023902002454,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.5788,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.3681513757299373,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.6167,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5328726228914815,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.5887,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.40482039710821704,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.5998,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.3471325558361177,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.6261,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.4708026052451067,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.656,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.48267555668015905,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.6586,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.39935894457820786,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.6007,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3707866724890578,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6012,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.4220376448592463,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.6754,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.41589827662354534,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.6374,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4144330890081341,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6389,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.44259278148951,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.6401,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.3668074154530959,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.6189,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.42051676368538643,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6059,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.4198663098402038,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.6102,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.3887042775608677,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.6298,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.41671468252084226,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6175,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.5725914639247383,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.7004,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.41570310691335594,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.6219,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.37678986724126673,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6325,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.5087701518502271,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6035,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.5968366253849593,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.6302,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.41294872983364506,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.618,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.40689227115744747,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.6187,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.41868893419471087,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.58,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.37781440875663735,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6063,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.4231537435489686,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.6347,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.4813253066367813,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.7026,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.37495130821317624,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.5624,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.39158733120979616,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.638,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.4240090134696765,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.6899,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4494047917779341,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6973,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.38717966988032976,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.6946,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.40085836155970156,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.5721,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.46835619449527544,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6198,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.44389585227029554,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.6503,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.41901331171884865,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.6522,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.45930967963327746,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.5742,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.4149968259909745,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.6598,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.4684133133426712,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.6798,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.40471835448358673,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.5783,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.39560149646464277,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.6177,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.40033366704773343,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.6688,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.409847818506166,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6238,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.4150779849093329,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.6336,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.864398242995123,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.6764,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.5700955932577777,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6777,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.4018859847440789,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.6372,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.5595919704007116,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.6671,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.44744266480441175,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6112,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.3344518507688403,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.558,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.4113097200970075,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.6259,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.37460528555372463,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.5764,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.5173789381383239,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.6869,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.38227613922471,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.6505,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.38493026301224964,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.5685,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.4490857230447103,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.6946,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.41304345370614004,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.6859,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.40744245874632135,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.588,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.44239687927494337,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.5929,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.43987486402033016,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.6222,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3721311434397407,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.579,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.3904356335284676,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.6187,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.4871806474824309,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.6944,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.4021699333747135,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.607,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.4246269131950553,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.6393,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.45995076739188906,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.5958,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.49114507115566153,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6515,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.43786087916568106,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.683,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.45967511384777227,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.638,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.44382110536748526,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6734,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.36098842399148445,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.5527,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.4384488298020481,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.708,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4605140379415962,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.648,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.45270005595938945,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.6898,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.4106034614437745,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.6174,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.38222181067866917,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.5907,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.37655596401410873,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.635,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.4439893541682315,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.6316,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.4650227804386738,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6323,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.38012305655100215,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.6356,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.43153803190878853,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.612,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.46046853955822276,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.5875,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.42983289954702175,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.6309,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.35262408600859013,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.5742,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.43184209643963073,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6638,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.4889204934449891,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.6544,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.42236336939209945,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.6422,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4567245542144497,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6735,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.4357239280931403,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.6482,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.34538968516681984,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.5838,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3746841707569123,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.5988,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.4022329001631021,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.6291,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.40013953435417127,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.5814,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.40005018623772687,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.5788,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.35388380651555296,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.5536,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.3703089037011794,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.5869,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3622268087447349,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.5807,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.396655726300398,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.6126,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.35351844522496834,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.5858,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.37817326714801963,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6063,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.4276057921437013,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.6533,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.4328477312558595,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.6599,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3958361891193586,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6459,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.41189001066763725,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6646,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.3675769278662478,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.5578,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.4985512195488047,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6103,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.4078016854637492,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.6205,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.4789924835755939,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.6365,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.42676372270114,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6447,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.37019111540882965,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.6,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.4344255348252073,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.6233,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.5098592399994804,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.7041,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.40412614594107715,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.6506,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.4017842528798377,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.6342,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4186885329532945,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6644,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.39614510568843886,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6043,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.4407424852778707,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.661,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3947691132853167,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.5734,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.4102618532437786,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.5752,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.39763954077320046,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.6331,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.38859094017171963,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.554,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.4588451672372624,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.6015,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.394617664738612,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.6332,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.40402859773708183,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6502,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.3620964096863013,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.5847,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.39653047718654283,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.5759,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.37820773086511067,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.5757,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.48354202592923184,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.633,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.4088213449252369,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.6342,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3785698115850805,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6574,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.3841581688458352,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.5831,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.4175965446852087,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.6021,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.40051758545421356,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6028,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.37273183672886745,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.577,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.3578309465440957,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.564,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4164954369827976,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6235,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.40703600699695486,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.627,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.4308613221237642,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.6092,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.40309993952556944,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6486,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.39045448266587895,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.6294,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.3871387157818062,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.6227,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.4121871503169258,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6177,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.5563305241966505,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.676,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.44233128189965715,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.6635,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.43228959200325306,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6595,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.3882914765123684,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.5887,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.3604084122871091,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.5783,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4923309735087353,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6342,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.38226580095034884,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.5738,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.4377425839387685,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.6496,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.3527473642611892,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.5983,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.39912448491239905,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.5969,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.4226452613966473,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.6234,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.42902322612300664,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6761,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.4469457004162467,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.572,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.41968848850301177,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.5958,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4088909356738927,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6198,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.3856136564613469,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.5778,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.6538311846681734,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.6943,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.38972142264775966,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6159,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.3913667900168577,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.6314,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.39282375591012886,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.6334,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4060893225762606,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6276,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.39725728594060705,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.6707,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.45614412437773155,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.636,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.3787903211402147,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6121,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.46964438676140063,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.7232,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.40521348986377664,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.6322,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.39599913318282937,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6513,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.3866365521413763,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6067,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.37923608237865436,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.6434,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.37607585850317243,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6123,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.37654613400330933,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.613,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.4085526259935621,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.5823,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.37706342506067253,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6105,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.4321633352133741,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.64,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.40979503291913427,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.6258,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.35312484737595107,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.618,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.3648822216766174,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.6074,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.3876797565517906,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6507,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4596479724815644,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6629,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.41914541205421035,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.6365,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.4226538276338166,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.6159,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.453590112930756,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6396,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.8227867613319757,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6052,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.4032076507160707,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.6231,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.36310835104679606,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.5699,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.37503508076643055,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.6047,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.3558234391460902,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.6084,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.4579645184505321,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6149,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.37666047242840756,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.6185,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.36736878091906583,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.633,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3969085332861335,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.5631,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.4769510880191858,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.6434,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.41106459360183417,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.6728,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.49660782368097145,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6455,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.4628750642052684,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.6628,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.39404379341324314,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.5824,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4383186870087429,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6194,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.3653631542996674,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.6077,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.3957605584777071,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.5874,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.4037165979888819,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6066,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.428723502501131,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.6354,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.4378736093813807,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.5974,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.44772881761449956,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6243,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.3928920983318415,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.6209,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.4336639143470741,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.6367,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.3277240984708229,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.5473,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.3828685362181596,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.5927,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.37935820321777786,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.6213,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.37713433584045297,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6042,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.39006771178981314,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.5797,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.4549279176101698,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.64,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.3759867099326376,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6041,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.44358842570648876,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.6691,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.41177907181643036,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.6485,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3995106320112122,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6499,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.3675420880857662,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.6341,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.621797000673849,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.6395,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.4097072469260797,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6102,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.5407225785782159,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.5909,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.4122705520910875,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.6089,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.38042277285394877,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6294,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.4465751557766768,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.5854,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.40270091386558127,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.6226,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.4270860466442265,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.5886,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.3791287171667277,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.5955,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.41745866531373826,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.6487,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4400123322302061,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6683,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.5849302348906806,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.7339,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.40153978522197675,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.5936,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.3854755809147383,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6172,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.42008015852413283,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.6483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.4212138721741976,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.6017,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3832577024291346,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6145,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.3377575889029809,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.5836,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.38250977122866126,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.6037,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.42094045082935005,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6433,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.3820832037132924,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.5992,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.500001482850949,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.6988,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.5465847295478029,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.694,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.4163535477641629,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.6691,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.37274214909560893,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.5738,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.3865735624911239,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.5859,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.4870914856922961,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.6312,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.42538631972017427,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.6686,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3860801731383532,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.5808,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.4987207956749768,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.6563,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.4078087284343223,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.6138,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.3739938231097275,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6631,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.4399246742415485,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.6501,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.40071478863608095,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.6179,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.37147629428382106,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6027,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.3879847582609873,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.6125,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.38679631395041086,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.6057,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.6241109452118239,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7422,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.4586768882561857,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.6315,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.37723836829454943,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.5763,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4306229237441863,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6055,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.3864565906226324,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.6145,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.45180935071199785,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.6469,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.46471876807148316,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6876,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.39990539409713355,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.5792,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.42942239779003955,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.6408,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.423469642517262,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6451,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.4689776508975757,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.5901,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.3714419107868351,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.5707,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.39583113714780643,
+      "learning_rate": 0.0,
+      "loss": 0.6271,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1639115726553088.0,
+      "train_loss": 0.700018780930837,
+      "train_runtime": 29285.1958,
+      "train_samples_per_second": 1.024,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1639115726553088.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..af3b994f74c2f13aeddfcfcfc7298a070899de7b
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..61752e8915242cdabe035ad7700cb6f958c6a90b
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:095b0c066a786d9fa7b9743048c71cd2b7ed342cc692815b23c04f41c3e5c31e
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..286e2c2674b240eba973c8041bd4bfe1d8a712f5
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9bce4c81f04f2ebaee51d502e8c15728a3a6e06201db509f429bc66a58a87b2
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..885d58faa403cd028a7ada1286d812793e23835f
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 1.0782078108678725,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.5314,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.0037748871538414,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.4457,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.1053463923600908,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.5674,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 1.1243960613506545,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.514,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 1.00841681692077,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.5181,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9851049857398083,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4734,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.9346731112899275,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.41,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 1.0388157530251243,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.3507,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.9032230002036801,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.2218,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.7832609647729597,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.0542,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.9730372629946202,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 0.9675,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7716758925878686,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.0446,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.7351105141336544,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 0.97,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.7590877823875037,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 0.968,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.862001055859692,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.0147,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.7401084648326463,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 0.9842,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.7280729375008436,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 1.0106,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6427810107091592,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.8929,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.5577149809059004,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.8335,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.5597236992649832,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.917,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5856837125576331,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.8769,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.5415079697797732,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.906,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.5390421862493808,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.862,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.5936834847094479,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.929,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.517546577551544,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 0.8734,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.4692370942023055,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.9176,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.6064203180946028,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9189,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.5355115074693406,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.9226,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.5526680033802652,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.8585,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.6250427254190083,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8898,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.5751008013736765,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.8642,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.5997096010242022,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.8783,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5153207577859897,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9764,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.4940168304549387,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 0.8537,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.6340293239222504,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.9192,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.4654101629859658,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8282,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.47534715474059197,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.8343,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.5592703237545299,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8589,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.4617217902728695,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8329,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.46116022384837163,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.8,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.5590053178933165,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.9246,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5327015276069621,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8644,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.5055057672438585,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.8465,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.5160236926727955,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.8897,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6456337117548154,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9126,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.4430961642687092,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.8395,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.5072626923883533,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 0.8827,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5375023711847232,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8321,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.47761260989260346,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.8416,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.4714384959069264,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.7586,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.4494309002584157,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8051,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.448268319282034,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 0.903,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.482599187517368,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.7503,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5341483553733253,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8975,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.5902234488941541,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.8803,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.47711627331975215,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.8352,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.6494019156360745,
+      "learning_rate": 0.0002,
+      "loss": 0.8666,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.4657753841679875,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.7725,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.5012960618867901,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.8394,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4698325753034376,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8371,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.5131939985752545,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.7883,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.4475679183229205,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.8087,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.43235736858508816,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.7295,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.5090252509945237,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.8488,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.49769313808738236,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.8864,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.430549326517399,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7422,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.5335405696936507,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.8491,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.5616102390859028,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.8632,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.4898557321457483,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8596,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.48687077498181897,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.7753,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.573843156384488,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.8917,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.44160542816380943,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8187,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.4590305279469336,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.7603,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.44667044744960277,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.7877,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.49118475163538916,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8661,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.5045332172686827,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.8415,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.4529834789311489,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8218,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4420842289599489,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.7662,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.4793570083240121,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.8215,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.5547977557934145,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.8068,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.49307798171651535,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.7447,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.468758023957036,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.8129,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.44259774727013146,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 0.7862,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5106391511608349,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8318,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.6202394187290609,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.8637,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.5122146608102414,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.8237,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5148195184994458,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9645,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.500459052702555,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.8149,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.5012533252716662,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.7844,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5082868021269187,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.7612,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.5002359115984162,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.7667,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.46717372027730364,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.8329,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5464740170191148,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8945,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.4216179776785064,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.7417,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.5242674567300103,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.8359,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4413129016219234,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7834,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.5259889283709757,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 0.846,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.4488863126086829,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.7817,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4919358738924874,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.7426,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.4798760091057027,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.7894,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.538485971097003,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.8093,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.513366273404903,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.7792,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.5030926322111889,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.8661,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.43326580616893945,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.7177,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.46267987063497157,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.774,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.4647459226751621,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.7847,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.4671650840215938,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.8229,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5010409060742875,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8012,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.43678067885716126,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.7526,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.4414276904744747,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.7306,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4703195728087392,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8314,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.4140033042661725,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.7279,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.4992922576120073,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 0.8055,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4564238539052949,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7914,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.3983832475948493,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.7176,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.5358145339753235,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.8084,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.5580701237430883,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8086,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.45014351621514576,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.8038,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.49482662547256834,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.7771,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4785941402346086,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8812,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.413038663138189,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.705,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.5068091812336115,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.8118,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.4044615729153809,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.754,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.5131866881828823,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 0.8209,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.4937709196916679,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.755,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4905409456203536,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7933,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.5575318881989391,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.7949,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.46528504729449455,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.8185,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4670060725187256,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8065,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.43269193803693456,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.7703,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.44158426698385295,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.7997,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.458958112862049,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.835,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.509152891666129,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.7686,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.4713020765860328,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.7525,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.46303360049740766,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8002,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.45910893450839446,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.7494,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.43134847723171826,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.7798,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.49014505922486473,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8493,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.4733554654602289,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.7839,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.47030819786934236,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.8733,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4629123966641584,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7857,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.4314385753341011,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.7568,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.4401343563472461,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.7818,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.39948816182307095,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.739,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.41953302504580503,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.7946,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.44653089559058606,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.7732,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.37848478996909823,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.7567,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.4302699400835671,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.7567,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.44513898799239343,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.8458,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5026683457789477,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8723,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.4544036121912563,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.7982,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.5001248954779373,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.7937,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.5047715395387671,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8426,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.4286009962772737,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.6964,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.5769832815862597,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.8705,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5174392524864813,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8313,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.46360358996447526,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.7806,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.5041118693355422,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.8115,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.4263069922493244,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8379,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.4196993092812524,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.7028,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.43315861533948996,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.7821,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4183118319005412,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.6924,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.4363924646924392,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.7162,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.5026222267940708,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.8423,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.46954716345122977,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7307,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.4334353060963593,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.8122,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.5343974725806075,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.8015,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5152336777453058,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7392,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.44125363525393313,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 0.761,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.4352274281590717,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.7527,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4478477213882576,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.7522,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.46558101443158806,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.8325,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.4499716576712317,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.737,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4556175554147233,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7528,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.47783394689592473,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.8193,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.47807561494885875,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.8089,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.41756990234426405,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7716,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.42731198203876414,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 0.8142,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.4847930882369442,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8385,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.464915969639284,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.7324,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.5002738971807956,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.7015,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.5023550965237535,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.7307,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.49150226338969616,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.6957,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.5018907968640427,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.7841,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.39214524669350853,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.688,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.42851833998123146,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.744,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.5115573107106643,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.7902,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.5031210824511215,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.8121,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.4422611027768926,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7694,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.40095867948126285,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.7285,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.42020494177988665,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.717,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.43899302381157285,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8096,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.4055049264781474,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.7496,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.4409046256864588,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.8141,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.46769922457624513,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.775,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.4693161824007968,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.7576,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.4791426220542533,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.7763,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.422635449428891,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8065,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.43519477212457386,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.731,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.5021449909182034,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.8571,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.49111009786941157,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8424,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.5004754574494609,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.7619,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.47862670141886915,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.7789,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.400535834468971,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.8097,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.3965503578402775,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.7492,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.46438594527621285,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.7531,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.49645821863670947,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.7476,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.4797602743957526,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.8003,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.48620645881858454,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.7763,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.48710455717037493,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7308,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.488739719891276,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8456,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.4151643748880333,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.6726,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.5615694070241564,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7971,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.4365307775047827,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.7632,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.4822219117865235,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.7515,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5058365326021625,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7915,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.44524691870655586,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.7229,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.47184407725185074,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.7908,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.52321985602367,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7475,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.4259709526108592,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.7393,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.42008032226899766,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.767,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5159087518954155,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8773,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.4026844295780718,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.7382,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.457361121265748,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.8101,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.39846496644139395,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7036,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.4633248517816551,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.8288,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.46349626775271696,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.8263,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.46354979074211466,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8229,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.4622245498790009,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.782,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.4201225638647903,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.7595,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.45240939219601756,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7586,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.4229103664112331,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.7522,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.47614492773008826,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.8181,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5594293747726962,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.863,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.47959040417664484,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.8326,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.44611393404366323,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.7456,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.41646709788210534,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7606,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.49413052721639045,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.8225,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.43207494172455535,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.6985,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4685530071491613,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8728,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.4748529746658996,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.765,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.5153684120119952,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.7984,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4687234921012601,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7725,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.49591514964277283,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.7857,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.46511631376724744,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.7754,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.41817559631604106,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7059,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.42292853515667883,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.7317,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.4555647309671736,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.7881,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4106971317302793,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.748,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.4151948013402095,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.7256,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.461900745415348,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.7443,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5106190880012906,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7918,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.4449232128613463,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.7788,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.46317995146668955,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.8401,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.5709331162036041,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7444,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.4111423556450833,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 0.7836,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.416597649490344,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7613,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4437037245893358,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7686,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.4804498085842282,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.7955,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.43955022809984884,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.7277,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.4730184206969309,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.798,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.47261200218264654,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.825,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.4958987350547001,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.7624,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.45183616075090804,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8222,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.44803560898773614,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.7226,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.42718829995419133,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 0.7481,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4353186573981329,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7603,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.4202904819567657,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.7449,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.4794847518043432,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.773,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.45356141483247475,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7635,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.48737870973855363,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 0.8234,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.4555062112337216,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.7405,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4328351212745179,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.6942,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.5617653746446652,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.7743,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.42251896089010094,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 0.7327,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5347497686840524,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8222,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.46392759769161884,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.7476,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.46963090477790553,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.7022,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.4266844164826448,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7656,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.41242368746109326,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.8116,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.47546443364095325,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.7875,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5292143795351046,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7666,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.47803327081719627,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.784,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.4018308323084294,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.7479,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.43518027414862276,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7893,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.45918856850372475,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.7766,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.44310990709580944,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.7575,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4824186440304817,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8228,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.4164815411310583,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.7584,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.4492882028447551,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.7653,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.4583365189022047,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7365,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.40883526606640835,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.7414,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.4571007026103209,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.7315,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.43915788183077686,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7649,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.42781757478259014,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.7031,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.4194122860641237,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.7347,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.3946453275568492,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7288,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.3716516600450698,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.677,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.569373216702423,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.789,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3925415031456995,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.723,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.4295146910368237,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 0.7327,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.4391000814211482,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.7953,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4974671790719608,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.7985,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.4793462957844173,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.7564,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.4205080963822408,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.7161,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4442006612150125,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.7848,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.4677563528998587,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.8152,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.4463099795826012,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.7082,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.41450881976512516,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.8286,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.49032251343742833,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.8278,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.5551963533783116,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.7858,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4425974836672353,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7261,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.4308581660899142,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.7759,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.4089398086125329,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.7032,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4478445524514376,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7864,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.4349562448937956,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.6596,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.47145823286121696,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.7355,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4527311758640081,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7405,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.5662376082754184,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7689,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.5303863637398115,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.8052,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4457136180185171,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.761,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.521176367976079,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.7912,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.3993581009496419,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.7552,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.482634638047166,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7378,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.47156012587256724,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.7993,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.4529242253730277,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.8097,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.43198723603603484,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7645,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.49769246836747966,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.7583,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.44381661099224823,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.8192,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.41365517886042213,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7281,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.39253340117724317,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.7751,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.4221356965204189,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.7335,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.465812608519683,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8006,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.39955412682780783,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.7791,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.4198511360966401,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.7713,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4683459551741011,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7975,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.524369646706343,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.7755,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.4475292419332502,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.7293,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.47005833665256663,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.734,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.5459009790421003,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.8221,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.4717398269488305,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.7679,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4614656366933708,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.6819,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.4426786261456807,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.7383,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.5046949310507889,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.7637,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5151554930335568,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.8356,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.48056516871808136,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.802,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.3669928978344368,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.6344,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.38839417685216243,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.6665,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.444571466544383,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.8017,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.48649755629417796,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.7976,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.4674034942105575,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7909,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.40054121875355303,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.7854,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.45543333246211487,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.8092,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.43356723573270656,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7513,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.42987940452717943,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.75,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.43154615134467134,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.7715,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4577432897967549,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7883,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.4038076486111166,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.7199,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.5876808158493642,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.6808,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4428224313753587,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7276,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.5834742907517645,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.8507,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.4530072437109187,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.7194,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.45015304766820907,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.732,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.5145809525792678,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.8305,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.4470293039575751,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.7451,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4187471245853827,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7431,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.4542301308072446,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.7999,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.4139522048552885,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.7829,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.39572804667237355,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7474,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.42426366372189334,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.7804,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.4998095801741916,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.824,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.41234129174941464,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7441,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.38419386021960866,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.7612,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.5637900248272991,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.7429,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.41953989413339743,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7709,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.45785135429480267,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.7743,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.40332005903710416,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.7278,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4304370314944406,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.786,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.4134176865184048,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.7411,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.44883302145505194,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.7239,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.4446190605144744,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.6804,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.4559578816460269,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.7216,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.46692582047382547,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.778,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4670038870928643,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7928,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.42271310945924956,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.716,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.47097578907718746,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.777,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.42074461461628854,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7355,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.3771891409228768,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.705,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.4143114769704897,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.7558,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.41762981116418973,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7619,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.4833105403522849,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.7258,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.5263680970417046,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.7977,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5252260664462001,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8086,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.4060831593752984,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.6517,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.5183938396928125,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.7967,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5223281194557258,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8407,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.539061397830276,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.8545,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.4547422637203524,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.7115,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.3878963454195641,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.738,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.42732703361178465,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.6755,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.43086787308731883,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.7616,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.5263563910058618,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7579,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.46390603341389813,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.7246,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.4384442085538908,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.6936,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5084653458182219,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7694,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.42629750863324306,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.7096,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.4174225185106244,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.7668,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4848832911881881,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7749,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.5256712576294765,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.8654,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.45369406635948906,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.7768,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.4580442622619772,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7038,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.4121359019799272,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.7529,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.4745036119867769,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.7799,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4653160110390173,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7328,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.3900147216295794,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.7004,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.5052791180641548,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.8386,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.44377730164941653,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8182,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.4279979054066684,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.7957,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.4178082284065879,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.7324,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.48231159640949595,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7425,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.49797186124876386,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.7728,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.4210454989697462,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.708,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.43773426687044753,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8104,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.47375967549476183,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.7881,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.4508248166569347,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.7566,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.5109080365948736,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.8352,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.4105244821037551,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.6978,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.4770073011942472,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.7503,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4476019882960167,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.729,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.4337305503040713,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.6877,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.4395490030745343,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.7754,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4478885632204688,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7173,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.45970180329057325,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.7813,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.46335197040398113,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.7375,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4778764846294453,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7646,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.46623072488367895,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.7116,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.45062956425618433,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.7567,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3901412272035564,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.67,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.4481168814065678,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.7487,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.45255804332698363,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.7566,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.3858371436267006,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7214,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.3838906928309417,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7824,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.6921743256899109,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.8048,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4723095825934299,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.8044,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.4735570320108188,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.6807,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.5826638776213384,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.7849,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.45486726183484355,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.8277,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.4997180204666084,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.8167,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.4469017072685085,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.734,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4265899873892352,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7077,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.4665814125248521,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.7479,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.5392805201410067,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.7737,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.5366952879705709,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.8441,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.44525037077182855,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.7602,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.42518163895662253,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.7703,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4977376483405448,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7516,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.482970794527716,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.7232,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.4457733986004114,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.7644,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.44855878261627097,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8107,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.43853317843352935,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.7253,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.5211453375036034,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.8156,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5456521312038222,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7169,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.44472418234899885,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.7461,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.44575325016140327,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.7025,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4327582162947951,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7489,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.4141928177631284,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.7049,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.6626481468166168,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.8694,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.548728387711451,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8548,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.42774074789324845,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.7204,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.4199250541649349,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.7111,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.5061021079058381,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8057,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.4495334182361452,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.7956,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.48165073581798634,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.8259,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.41335385257603885,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7342,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.4145266461877525,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.7595,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.44201502102589235,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.7829,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.5203049947428746,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.8107,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.3863436364378562,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.6327,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.4551135421256658,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7279,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.46564114422191183,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.778,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.48069249451736823,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 0.735,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.477377028194487,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.7508,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4939324041520078,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7438,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.40333028899327916,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.6965,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.4391616856195648,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.7382,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4158524659490832,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.71,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.4651585154255733,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.7553,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.4304389835842047,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.7894,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.40308975255561746,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.6878,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.4258435882075208,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.7406,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.419156478110173,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.7157,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4050502111153129,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7601,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.4757091960255574,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.7088,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.4805129071729885,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.6709,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4655840407847795,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7099,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.6808307954945988,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.8317,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.4651069645499294,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.7326,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.42048664296755117,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7201,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.4336395380929332,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.7521,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.3820731757706012,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.7068,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.43070761285149495,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7258,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.4793623382558248,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.7424,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.4525946541267634,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.7221,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4081739057196793,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7204,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.39358430542956696,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.7097,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.39278965176432057,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.664,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.5334725427950925,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7787,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.4674629512590317,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.7136,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.4687141573028758,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 0.7125,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.48416110239095644,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7883,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.7565257686655121,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.7748,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.383224478590731,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.6952,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4036380929840031,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7542,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.46326149095647695,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.7347,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.5846257957630842,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.8552,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4465688747242848,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7976,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.3917549578289485,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.6694,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.4048606026638533,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.7049,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.44460583808185367,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7478,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.39128767229480316,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.7546,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.37647276925663165,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.6637,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4351759016292475,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7691,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.4182015452967113,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.7539,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.42134974744242665,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.6873,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.45725498098243716,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8192,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.44261361809918787,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.7056,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.5130096571268379,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.7339,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4008058826449141,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7322,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.45631894274308066,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.7956,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.43336270108874525,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.7363,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.47032489917062453,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7352,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.4125145135017204,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.6969,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.4662370497753131,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.7986,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3824746192067442,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7539,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.4394673484421157,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.7932,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.3928956694847816,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.7564,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.3794180412480094,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7352,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.4321572584394699,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.7277,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.3886733474800513,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.6527,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.49422080811805685,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8295,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.49166063763300444,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.6551,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.47865172461051464,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.6979,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.40959610643890904,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7106,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.4503720919421993,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.8032,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.4498320174874359,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.7532,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.47826486319096867,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7374,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.3978639353039526,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.7523,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.48483046835985555,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.7692,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.45089043354921626,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7443,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.4665950656664754,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.7904,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.3883149125160683,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.7179,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.41632379179992457,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7281,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.4977495720997442,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.7299,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.47903261526417296,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.7304,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.37701238814284743,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7557,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.41930029095265514,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.742,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.4188454431920848,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.6864,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4468757996322642,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.712,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.4343228600884407,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.7297,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.40922799139319394,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.6923,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.4203326579304541,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7145,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.47335490523849333,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.7525,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.41381219985814566,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.7399,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4409350610415614,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.681,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.3538655075484582,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.7164,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.43471299509023836,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.761,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4410425533885929,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7548,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.4511426783652151,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.8016,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.4009363166019702,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.7287,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4775594919679725,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8506,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.45067055657521304,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.7547,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.5112675671620446,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.7682,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4307918320940786,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7041,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.43442178567013523,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.7524,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.4668392119887552,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.6954,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.5133283395950824,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.8325,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.4417181247890331,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.7256,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.45562250143164257,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.7251,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.44188741570717166,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7584,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.41305164806040257,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.7725,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.4448580030745107,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.7517,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.41272246176475724,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.6428,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.427761714352196,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.7489,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.41542254625356395,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.6996,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.42220688807675394,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7579,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.3687717725151293,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.7211,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.3881030069969,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.7115,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3657955407253785,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7149,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.40800587674458866,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.7142,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.42721068026872167,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.7204,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.5371856046024449,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7918,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.362776360031712,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.672,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.3889297840285753,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.7069,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5222808118875192,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.6683,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.47788481609241606,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.8138,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.43098527220345256,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.7268,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.4110374263224395,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.6886,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.4260683340007289,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.7213,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.38562882015100414,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.6659,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4545905517487075,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7252,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.42394997305272447,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.7273,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.41157563157389504,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.6836,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.44840001190614737,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7583,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.4182736826915793,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7195,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.4544480209267658,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.7668,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5152676059003954,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7463,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.40967721945566854,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.7725,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.4036255773867977,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.6744,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.38950914089952277,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7673,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.4286416680955874,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.7499,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.43590141467819987,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.7338,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4910032323410539,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.6965,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.3693106294847623,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.6599,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.49096247961929523,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.7749,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.39355592492079217,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.6922,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.47682273617619825,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.7402,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.4858149349113941,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.7702,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.37638690329692875,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7093,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.4496997952146683,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.6926,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.4156548552072922,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.7482,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4139701893635094,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7038,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.42070868574423165,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.7312,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.3996824303934374,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.6693,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.5498724293289201,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7898,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.39087687283609013,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.6658,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.49763265105140403,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.782,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.49971072384955406,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7626,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.471518962075809,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.7155,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.46901778025938734,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.7799,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3915997285214422,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.6841,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.3809616396925241,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.6585,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.45379184123717903,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.6928,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.39650350795032846,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7238,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.3948371161329786,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.6788,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.4032691640595932,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.7184,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.43571883502203596,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7005,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.4328528400250972,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.7296,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.6203793014054366,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.7943,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.42032184862200134,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7077,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.42287690028119707,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.7419,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.3953099910639756,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.7189,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.365946781409118,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.6947,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.4121012123710881,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.7287,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.40568950884016236,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.7091,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.42927205307186717,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7421,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.42045019660612665,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.7248,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.4688332920501223,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.7538,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4078824082008241,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.6997,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.40740133303695125,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.7137,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.45547854576707125,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.7737,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.3594657384543693,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.6625,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.4190519097986457,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.6854,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.4113728495190875,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.7386,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4265455138939629,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7259,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.4021569493622652,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.6953,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.6083975824782651,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.7177,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.4249433086732558,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.6917,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.44797184733914347,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.7084,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.4488995057651193,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.7712,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5119712023325992,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7138,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.4294619539709814,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.7378,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.40191573399988767,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.7069,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.39045686144937136,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6839,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.40775124902429194,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.7536,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.43690198174776207,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.7217,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.497685578274488,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7582,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.3927489114501421,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.648,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.3841325696674503,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.6946,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3968719116520625,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7557,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.4644707180066225,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.7221,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.47182237431179613,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.6708,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.37405376397766577,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7303,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.38693014739459,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.7205,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.4141446224776969,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.6831,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.44085268407984707,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.6771,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.5008038670322603,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.7293,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.38289060985917467,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.6895,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4750523039680865,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7094,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.43369605197655986,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.7762,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.4384513524762407,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.7413,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.41556593141549514,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7559,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.4313667616873304,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.6445,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.37484474718641714,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.6437,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.41599837666242073,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7088,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.40276103245754896,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.6754,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.4168388924555747,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.7518,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.42526591621099963,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.6827,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.40288589146266274,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.7652,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.467329947585081,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.7806,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3876255767858793,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.6763,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.4146466363686245,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.7033,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.4441170977868678,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.6681,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3558480903224922,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6505,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.41602598410389735,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.7015,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.3703330729862911,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.6848,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4263287984363347,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7294,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.44347359831237204,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.698,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.48781838992265036,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.7447,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.4609996397724964,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.6821,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.4713648588980301,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.7602,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.41196452629535585,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.7182,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5632700705840292,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8152,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.3998184746916645,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.6491,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.41509955532967097,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.7134,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4868921880399975,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.731,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.46637564860401476,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.7037,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.4516473152763635,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.7053,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3964393979907646,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.6835,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.3997508055133865,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.7097,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.3983774071109117,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.7137,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.45146135429403017,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7395,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.46216153520539915,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.7127,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.37503357112005953,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.6553,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.5407589874243327,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6839,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.4426383635037013,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.7556,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.4351030781652491,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.7125,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.42703783138354634,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7657,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.37859228984153764,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.6771,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.47642453052703476,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.7171,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4300388925618423,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.6921,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.4860275460776051,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.7622,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.37535010226201987,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.7053,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.3976404676748777,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.694,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.41977233221720256,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.7382,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.44837616499019745,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.7186,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.46263973386021645,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.6689,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.45182531508138,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.7417,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.39492668083723853,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.7215,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.4523013919145976,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7684,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.4381382220440621,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.7339,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.3970523596266796,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.7418,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4347272847239781,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7787,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.4130918424648687,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.7127,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.44002182055166045,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.7232,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.49877895452345106,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7238,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.39406534474061694,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.6675,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.45754995204049975,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.6866,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.40723032246884827,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.727,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.4959940655631805,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.7722,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.5060320422283979,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.7185,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.4382737101204542,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7016,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.4632557011709129,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.717,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.37928916344341357,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7077,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4988609159170412,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.713,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.5035465539807298,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.7755,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.4243815881999628,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.7035,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.40511097729163625,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.6997,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.34732451466396097,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.5713,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.4239987844039931,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.6592,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.43841975631729396,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7232,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.42246102198299307,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.6888,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.5032841321184606,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.7561,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.4905327312132084,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7223,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.43946961949901897,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.7665,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.4402067407496789,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.7159,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.47176887532052597,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7249,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.4476564253342013,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.7531,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.46488437306735336,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.7542,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.44826828994452866,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7475,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.4392302673067083,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.7496,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.4284828258125813,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.7599,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.39250654426022963,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7007,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.3734484227669392,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.631,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.4854947563926426,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.7066,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4055127583755472,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.6928,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.45396074749692356,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.7343,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.4296512549915837,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.6739,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4711535732387848,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.693,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.4267546928980294,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.6887,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.4739593317052299,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.7671,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.429430590107906,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7383,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.42881367720701874,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.7119,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.4615298357246681,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.7062,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.467141491830004,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7164,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.5202135770227491,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.7071,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.4475869027765437,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.6947,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.44295597600268727,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7058,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.47176094451362777,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.7165,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.43358146949114096,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.7151,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4412211134754077,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7551,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.4410286514374678,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.6723,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.4216559262181823,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.7483,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.3940472095500932,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7078,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.4027435998274283,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.661,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.3969104111607289,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.6883,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4824453554696596,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7817,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.44449554922837264,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.7433,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.43347524170875906,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.7109,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.39878441525322905,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.6622,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.4463980260611668,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.745,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.47312945243201604,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.7728,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.41318675822793455,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.6108,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.46171742829872736,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.7379,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.44500590778977384,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.6901,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.37258176457460235,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.6797,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.4277090342743537,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.7537,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.41455736640289276,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.7556,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3878051589700941,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.6958,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.4371718185766127,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.7673,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.4416283994462147,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.6506,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.3904046349843101,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7024,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.3627789630617062,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.6675,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.46654878719546106,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.7709,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.38495662223781735,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6598,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.38161629022718274,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.7313,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.39429952551911845,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.6631,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.443184825535912,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7093,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.4839129942053279,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.7042,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.41334895955463824,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.7246,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4863180689827426,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7449,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.3820454923774122,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.6725,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.4220214371529139,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.7403,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.41454752364842296,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7249,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.42604154192156635,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.7378,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.43228897217996826,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.7029,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.40739908117620516,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6814,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.4634356318463634,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.7377,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.5959063980403425,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.8282,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.3897472418811074,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6303,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.3612063612791084,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.6497,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.36916970218865225,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.7053,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.40208731910393475,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6698,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.42344557694377094,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.6714,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.41985601136707557,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.6884,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.43303368899400574,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6791,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.4120029133921381,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.7271,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.3746594000082755,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.6436,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4039068035463203,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7029,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.41184612698087747,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.7111,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.38301489631977276,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.6534,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.37525032785125834,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6434,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.456722632978626,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.7518,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.47633067924491773,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.7149,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.465979164027581,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7662,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.4091999980397747,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.6956,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.4635916658556116,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.7667,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.3658341568913632,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.6953,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.37921643901310315,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.6237,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.4131134541607325,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.6657,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4239805190357618,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7111,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.462761873466164,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.707,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.4022517047204756,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.6648,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.43799279337032937,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6855,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.4251834274788315,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.7075,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.41225338694188035,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.6909,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4070184019232702,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.6525,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.41649416449245563,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.7291,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.4546324691909841,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7653,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.45141669122155437,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7051,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.5444018336338412,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.7419,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.43065778637280483,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.7176,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.38167947206238456,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6412,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.4505920393626082,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.685,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.43960240840849185,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.7047,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.39407494617000993,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6705,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.4114040227268409,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.6595,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.4612087998564698,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.7243,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.43932608382970695,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7436,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.37415790553810785,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.6556,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.4504569090557854,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.6982,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.44034490188774617,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6777,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.4736985341131689,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.6765,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.4677624856624705,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.7287,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4395533085154452,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.696,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.39550018227943234,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.6881,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.47395930050205876,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.7316,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.39765873650882605,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6956,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.4891652376554147,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.7982,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.4232612644035031,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.6723,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.38762156442078516,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.6667,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.4909127349481194,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.742,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.41671132795123844,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.7225,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4856652200088908,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7869,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.5121480118222634,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.7202,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.40668954918811323,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.7713,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.37095807153206234,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6935,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.45446232774625944,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.7621,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.3887007782251961,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.7416,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.45157365228556273,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.6708,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.4032785843368036,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.6955,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.40866998088501677,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.6839,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3827212503725845,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6638,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.423243115774533,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.657,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.40384524585114895,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.6638,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.39767400585791324,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7069,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.4573851491431221,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.6972,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.46646520145860276,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.7123,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3730865894803738,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7056,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.3910630492654369,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.6066,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.4278308966101939,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.7611,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.42457588753017367,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.6762,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.42832922115693706,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.6467,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.464530242798856,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.6688,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.38472469459638076,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6841,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.38849071920916217,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.6736,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.44907458299897185,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.6982,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4565938291447505,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6869,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.3793537012311367,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.7202,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.42887604556878534,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.7523,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.46717937772712853,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7172,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.3764414926422453,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.6331,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.4429214812780887,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.6603,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.41987417589997855,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7133,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.5210846787709746,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.7689,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.4141037873790039,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.6461,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4782885729084308,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7443,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.40144968575883505,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.7076,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.4024462129348643,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.6517,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.46684997838951175,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.6774,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.4319143202542929,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.7165,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.3914237765407882,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.705,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3916173259933699,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.6757,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.39956961393530177,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.7288,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.3880763668296263,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.7006,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.38121867310237195,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6636,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.4311920380139067,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.6683,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.3846473660057849,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.6299,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3843852749197285,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.5507,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.4099112208585509,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.6898,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.4239536693345123,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.6707,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.39020478363358896,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6282,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.4138631017383004,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.6705,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.39382180684827073,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.6753,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.41063973948385085,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7252,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.45590420113087754,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.7075,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.479092633471132,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.756,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.40947527248349513,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.736,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.41755992091951427,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.6764,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.5008756508436008,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.7461,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4016638669500053,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6632,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.4324061289766408,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.6477,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.4412168422297335,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.741,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3984762587813605,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6679,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.449656501907126,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.6845,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.3764081526800237,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.6395,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4255182047758748,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7093,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.3999003305188967,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.6587,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.4807963299194944,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.7051,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.3845495646724184,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6897,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.4228148908480343,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.6939,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.41707744163602367,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.6927,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4075836226546671,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7014,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.3979750044490217,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.6854,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.405131010054494,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.6202,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.4208582594190111,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6775,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.4422173215338135,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.6463,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.4583800416518211,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.7189,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4663487212335978,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7068,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.6720076843435544,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.8099,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.41453475629692693,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.7004,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4838049864154515,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7132,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.38956248992999926,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.6566,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.4788107091066432,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.7609,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.42449048921034255,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6608,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.38315009506219316,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.6547,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.444624385828526,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.6875,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.38204646337438464,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6206,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.3892107373391419,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.6646,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.4612981285596248,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.6786,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.37446944446069946,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.6884,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.4777690939015667,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.7443,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.37431633724955565,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.6579,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.48271017790023146,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7562,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.3137843926939316,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.6008,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.48926191422711196,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.7124,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4840106358402935,
+      "learning_rate": 0.0001,
+      "loss": 0.6094,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.4109365942571878,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.6893,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.459250743064764,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.7158,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.37539218065461527,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7344,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.45438928402880036,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.7003,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.3850373296029251,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.6461,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.44240440247711077,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7101,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.42555419166065234,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.7101,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.40798177588241413,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.6487,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4794810474997049,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.624,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.33123398004892485,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.6161,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.43091338801889584,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.6245,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3982056048141141,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6311,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.4264119518275276,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.7126,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.4650117198486698,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.6909,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.3855651504866148,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6563,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.4004299561285694,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.6418,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.436475294722715,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.68,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4201228255812106,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7161,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.44028062963521497,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.6964,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.39430579219492956,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.6884,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.41841383153582246,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7072,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.39579772140427333,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.7173,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.37978904842785305,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.6913,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.44857958673888443,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6634,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.4136062585434959,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.6516,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.40079332433727344,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.6692,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.5191574423285716,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7063,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.4489649211382724,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.6717,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.359869051326043,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.6032,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3655288390710915,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6612,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.36150180799514897,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.6246,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.39936177766924974,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.7136,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.4561306271934777,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.66,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.4339140449482403,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.6832,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.49580888230308284,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.6612,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.449576174096107,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6667,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.47223661195255384,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.6377,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.38485172720289945,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.6492,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.38502627740041523,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6893,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.37223090200132836,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.6582,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.5748748140440352,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.7143,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4305085245494093,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.6323,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.4024815591372681,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.643,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.39677262680295655,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.6761,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.5059510797763671,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.731,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.46477522381453795,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.6085,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.40850542130652917,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.7086,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.45607674638907475,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.688,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.3768920463507832,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.6476,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.42699732311359867,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.6976,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.36609008423564304,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6408,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.3512204967360172,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.6572,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.4058479868686318,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.6939,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3748032216832234,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.664,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.39182865176607806,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.6828,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.38776893265216555,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.6336,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.35586013216170675,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.6448,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.5289535489633533,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.7025,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.37004350986403134,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.6477,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4022167263992691,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6628,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.36263873606627256,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.6255,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.5169443403628885,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.7272,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.5077474735865758,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7211,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.36978250143246966,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.6732,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.4332494356774574,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.6573,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.41306769252647924,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6417,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.39160267542461225,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.6577,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.3823998381292748,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.6811,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3631664497827257,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7202,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.49425879174920706,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.6488,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.3697474194740872,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.713,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4310295688825336,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6957,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.37156423458479143,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.714,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.41859740768728004,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.6635,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.39560270417941085,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6528,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.4370306562106364,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.6906,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.3922761785579907,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.6699,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4442771972537085,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6697,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.3928923028223549,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.705,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.4084189852974764,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.7075,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.37486857783456,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6281,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.3834285003923,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.6921,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.41008495293543173,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.679,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4293866733746269,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6939,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.3722777267141457,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.6893,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.37247546020014505,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.6587,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.49495381015713114,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6979,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.42177932672113927,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.6318,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.4659911613298317,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.717,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.38000172119463005,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6755,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.38658253768633705,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.6553,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.32209471560397346,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.5967,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.37915306478585464,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.5964,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.4144123664508527,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.722,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.44310885866058436,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.663,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4473815955317685,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7348,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.4394755934053004,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.7117,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.41061642198296605,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.6948,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.40904245927736166,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7221,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.46075497966600326,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.6659,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.40916738706197076,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.6964,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.43726657778993133,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7611,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.4257493496301308,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.644,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.42810911513519423,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.6709,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.39822224660334266,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6854,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.4119235536615006,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.6241,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.38188802844343217,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.6893,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3748132749109451,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6572,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.4070627537552577,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.6247,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.4228495901369822,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.665,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.5125789789195911,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.757,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.37762127950207147,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.7095,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.45047309222896786,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.6102,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.37069355456738307,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6253,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.3832310801872979,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.6533,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.3907820180259579,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6768,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.5219383239878169,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6795,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.4223255750226302,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.6748,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.34552505395735383,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.5693,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.38227932490961664,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6606,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.35869376565014843,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.6179,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.4402356169989974,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.6694,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.40006983810082125,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6883,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.36870685385884555,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.6445,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.4106821402930094,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.6718,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.463580435466865,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6595,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.35493664870979025,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.5964,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.45086792660255615,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.6241,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3931350665416031,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6916,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.4173050433418133,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.6907,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.4500438797061833,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.723,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3849060026134618,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6538,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.4572211266887904,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.6687,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.43618815957297913,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.6553,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.46688255334776935,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6435,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.3769347514358419,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.5999,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.5186139577531543,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.7119,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.37935317504485616,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6107,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.3638310753856444,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.6534,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.3352649331129673,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.6271,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.5109608294076012,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7438,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.3952899095104239,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.6307,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.42262482407209273,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.6796,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.447862424578651,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6812,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.4486983192896598,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.7068,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.3510166579154817,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.6795,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.4132820453615415,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6577,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.47102117904417895,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.7211,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.5029222166630157,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.6415,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.37199366289640917,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6991,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.36974073757686976,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.7037,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.5839954782570124,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.6726,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.43525495380757456,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6936,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.42660667566749433,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.6512,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.48853798260934156,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.7014,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.40782743015049067,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6324,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.39730932447158884,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.6312,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.43644781264951427,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.6762,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4357832284765578,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7199,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.390387285013816,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.6117,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.4554194962981507,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.7535,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4437873015953556,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6579,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.4099373197208037,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.6504,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.5887119870168502,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.6521,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.406654003215759,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6003,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.46502921756379767,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.7211,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.4815546468614992,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.7307,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.45506880792711507,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6761,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.4070671663392784,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.6572,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.38718530466773343,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.6572,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3867510783801677,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6178,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.3935342723371812,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.6705,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.3726697083680594,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.6068,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 1.0347332813609864,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6223,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.44285319936485756,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.7099,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.41594810747635413,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.7445,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.34338966810019267,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6373,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.3756166201718267,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.6856,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.4588254867492807,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.6636,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4021662128404931,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.5913,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.45494546001161157,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7814,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.39549127149064955,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.6147,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.5363666722798816,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6595,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.4401617730306245,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.7577,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.3863593737160001,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.6513,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.44121101952562514,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7003,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.38262277713125686,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.6925,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.3983866461238884,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.6783,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.3804255660605288,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6602,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.41078728515619706,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.6374,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.4598502622667575,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.7116,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.4094866119975501,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.6947,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.4113188707736787,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.72,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.46464533338121966,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.7239,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.39169990421551154,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6028,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.378749358223809,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.6523,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.4694356959374317,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.6835,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.39386387986657706,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.5547,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.41022887333091956,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.6717,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.4869445304036569,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.6458,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.39617360324862894,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6666,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.4461612095821093,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.6914,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.360307198191942,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.6993,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.41306360228437694,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.6044,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.3920470324410973,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.6394,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.39942722731479635,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.6395,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.36525434652313277,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.5635,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.3697333028186257,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.6489,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.5034693761597592,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.6638,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.38198759421158524,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6039,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.3946903523364556,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.6744,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.40744526178631907,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.6556,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.3730874608664777,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.591,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.3707336373470534,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.6409,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.41760664397135294,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.603,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.417252907963043,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6289,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.4192323860449515,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.6418,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.4427327328342678,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.7087,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.5116801317498956,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7527,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.4260104138905186,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.6794,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.4665746999686809,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.6862,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4279013793908042,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7239,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.3541190181930973,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.6048,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.4741478976025896,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.7045,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3998080163269796,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6662,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.44602537035538536,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.6517,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.37199940613723986,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.6568,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.39541468515323636,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7558,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.4990262998205763,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.709,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.4283663258149843,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.6866,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4000143487971441,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6564,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.42369255064292277,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6963,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.39753999292591025,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.6189,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.37759579166297613,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6918,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.439168186056477,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.6296,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.3384595598430012,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.6401,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.4397252068067179,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6843,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.4495587434394545,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.686,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.4033178131180434,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.6749,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.41966382476424724,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6455,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.5730029679016254,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.682,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.4780240044650562,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.6754,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.35299829115160963,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6331,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.43510483087266877,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.6175,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.37019265249504896,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.6513,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4344431213608693,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6875,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.3726653750133173,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.6514,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.4270131560555474,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.6934,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4344173959790919,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6117,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.3817477178203144,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.6606,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.3946044258169478,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.6195,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.5770606292806082,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6672,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.5431755903263947,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.7173,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.3757472330097249,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.6221,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.40837193866788307,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.635,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.49505128852289804,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.6539,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.41076517945799657,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.6393,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.39442404738518705,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6199,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.4155139537955921,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.6536,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.4099148371054838,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.6662,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.36153713831866474,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6329,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.43267639800342794,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.6379,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.5193961832474688,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.643,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3961001135726747,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.633,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.3776635113091816,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.5916,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.40026739610815426,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.6106,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.42506452958981844,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6091,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.4767583065398524,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.6733,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.3825141222402969,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.6489,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.40932401078675557,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6656,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.4683859535287396,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.6306,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.49713749196368245,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.6261,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4046846776425327,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6369,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.4226828114463185,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.6508,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.3926616257205702,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.6502,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.423404611081691,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6449,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.4535086604738742,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.6527,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.40480446195046826,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.5772,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4499097332556954,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.661,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.39306632797949126,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.6068,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.44711370463148087,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.6795,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3948332953812635,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6274,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.4146152458688994,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.6675,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.37633458731901637,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.6474,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.38096296648560274,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6363,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.451814194484857,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.609,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.4083807036318623,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6431,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4284353582157567,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.605,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.4395428406670857,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.6975,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.3854301445851761,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.6316,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4636660907598242,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6691,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.439135771165176,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.6261,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.40163744257019357,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.6819,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 1.0646340907060863,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.7305,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.4462724535627756,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.6636,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.3836630584129171,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.6037,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.407531748252144,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6044,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.38400199882654357,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.6967,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.45061107663107125,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.6063,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4455163000859113,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6501,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.39898912495896044,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.6516,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.37570427299861775,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.6154,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.40027078748491896,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.624,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.5043252218944329,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.7256,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.4508746476798115,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.7035,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.38034510318763615,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.5853,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.41062616229759885,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.6216,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.389224894970331,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.6734,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5504549150709207,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6572,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.4841059494145302,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.6196,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.3959549009580358,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.5784,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.38260545640032856,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6067,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.33628993948453895,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.5741,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.46023486039434475,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.6257,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.4434097007127942,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6798,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.3910731004224021,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.6605,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.4144363824122161,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.6141,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3688557031791854,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.643,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.4454768883445016,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.6393,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.4182386561668562,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.6309,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.46168326168634666,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6586,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.4425894286074168,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.6884,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.39908913622004444,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.7055,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3792121179497391,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6024,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.5066824688175311,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.6424,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.3692295300726596,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.6723,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.39902602821920813,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6735,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.40162107956838344,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.6347,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.3675015493410282,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.6275,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.46216279662287024,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6528,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.4298759543085148,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.7145,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.3898675773824648,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.6606,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3856297054663061,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6189,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.39208089448633,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.6709,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.38246061383981467,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.6356,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.369162140041079,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6162,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.4032891411955045,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.6242,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.42453013494560543,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.6939,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.37344003789130603,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.5724,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.38728418385000496,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.6005,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.41380676225434393,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.6631,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4531001232710246,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6574,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.42079095655839355,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.6314,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.3954924432799224,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.6395,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4647611116985209,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6011,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.4062107993968112,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.6177,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.39643487227860147,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.6665,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3740905841064871,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6259,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.4109066238005628,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.5959,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.4002381936076298,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.6319,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.3645989491641406,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6716,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.3584276941610215,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.6122,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.4567974782395917,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.6032,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3730897467914509,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6037,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.3578999826745592,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.5955,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.5023846237437409,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.6897,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.43293579540195665,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6451,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.35556202301195655,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.5978,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.3992386181720559,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.5945,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.40295555105868464,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6423,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.4117021021613468,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.6592,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.3599343489939831,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.6306,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3660990801052241,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6033,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.3679590577933425,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.5818,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.429818252922498,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.6942,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3818104692821199,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6015,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.3736973177830105,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.6408,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.4112187825282222,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.6375,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3651825655448954,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6108,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.3597691430568277,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.5925,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.40261787330850607,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.6734,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.42290744504416355,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6564,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.37878361366296864,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.6349,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.3743736013061357,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.596,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.4795105204161491,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7251,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.422585936491689,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.639,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.39752662522983573,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.6051,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.39881541592941316,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6247,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.3922418850731424,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.6036,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.3634862394883397,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.6165,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.35977693427692065,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6347,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.43728907962622665,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.6513,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.45508225007313574,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.6269,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4892633215660128,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7104,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.35236445947495665,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.63,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.380777365620293,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.6042,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.36454466275990527,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6202,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.4098549804530554,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.6482,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.4588665532339407,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.6486,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.43242597005494954,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6449,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.3678373825512149,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.6247,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.33858758539495293,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.6243,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4283505606342545,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6022,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.46383355745425225,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.6582,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.3864518214986883,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.6434,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.39866820912672296,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6447,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.40650008293660495,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.6116,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.37924622645477385,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.6261,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4342175377696183,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6537,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.4005682284224154,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.6126,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.3888776684269537,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.6268,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.41947214703791696,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6733,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.4617847420212906,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.6783,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.37886265475351183,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.66,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.3562473191738787,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6021,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.483350601891624,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.7105,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.36425377392262603,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.627,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.42862663832817294,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.638,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.41118852381669363,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.6834,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.3805524696767454,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6736,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.5077091862940181,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6506,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.4306152496489916,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.6736,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.3678045610641948,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.6492,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.39191865227661193,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6401,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.3670367608752555,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.632,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.4252473651270437,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.6437,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.45318004841032933,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.5808,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.5098946265869971,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.676,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.37726528836211715,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.6112,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.36393333240005293,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.5759,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.35515819390154585,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.6279,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.6485146562541209,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.5874,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.5288235736362663,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.722,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.477940679451871,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.6631,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.42816101605739654,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.6371,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.385614118024758,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6454,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.4216743611785189,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.6432,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.3854726314225563,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.6107,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.36890568603266516,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6186,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.41984084864865084,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.6838,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.42672354262943574,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.6472,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.43571153242149063,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6476,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.47946532001055614,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.6546,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.44619839471848427,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.6947,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.35869339050145127,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6402,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.3846343282936006,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.635,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.3822661388073339,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.6331,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.4087421275219027,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.63,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.38026134208371226,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.6017,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.44220657991710305,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.6339,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.39305850071696574,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6463,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.4115319755731726,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.6387,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.4434644719318613,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.6322,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.39059993149387545,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6084,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.45522559328114276,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.675,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.40345951732394286,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6051,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.38291898115365075,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.575,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.4009439451613568,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.6205,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.42740356313299444,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.5966,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.42203414312961285,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.615,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.37462441142482394,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.6232,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.4453257424581649,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.5901,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4497703092124468,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6283,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.4506692794303581,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.6499,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.4163203546144899,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.6459,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.399255167598255,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6229,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.455231939617007,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.6895,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.4308466172348704,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.6152,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.42234717675112027,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.613,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.40401344881413,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.5976,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.453079034843873,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.6798,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.39168209780175084,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.5983,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.4641371617302695,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.6513,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.4854567373155207,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.6578,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3946896301102641,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6225,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.3871040364117289,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.7264,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.5499109389426425,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.7151,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.352017631785061,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.5777,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.39393949674624945,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.6065,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.4445275221573401,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.6938,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.4912464243057575,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6495,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.40632202592724853,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.5698,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.4169507683934783,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.6843,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4887464175157345,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.5975,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.38067928789356154,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.6141,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.38185216981431297,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.5865,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.3978552826513095,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6347,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.41171428006598065,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.6522,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.44946607999593247,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.6652,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4154439285961479,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6861,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.41163727461473826,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.6541,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.3964474748802533,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.6496,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.44965777952677877,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.551,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.4218007844873207,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6031,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.3798275175729113,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.6004,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4987480211561112,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6531,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.44030873475463034,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.6322,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.39592177686674895,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.5608,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.4296304690593407,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6666,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.3440283545512249,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.6247,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.39678078164363756,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.6011,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.41787884337990133,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6511,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.4373438589162138,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.62,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.4285871794267623,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.6149,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.39194465346106083,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6221,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.49852232175056294,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.6959,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.49015137819027743,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.6071,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4561759156549268,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6924,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.40457984813967485,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.6405,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.4552863080098185,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.6249,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4045385459595967,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6344,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.4872630042726612,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.6896,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.39672074889756576,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.5996,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.450988094347731,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6091,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.44983794037069097,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.6867,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.41541069931669183,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.6059,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.42946789211648667,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6451,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.43525600732114966,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.5966,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.437918783066002,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.6068,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4314902293258785,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.677,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 1.1126329635651677,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.6036,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.45498482196176615,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.6571,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.4298552259283547,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6105,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.4730774388482903,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.7216,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.3903834882469152,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.5768,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4339246885401077,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6352,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.3881044724903212,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6078,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.3557673839680774,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.5902,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4352593345765754,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.5597,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.43357196154806343,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.6785,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.3669378366091731,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.6085,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4412473289705872,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.5943,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.5208888595319536,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.6504,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.4323005556458796,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.6607,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.4108862936123872,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6142,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.4346260375930054,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.64,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.40308813177940356,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.5921,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.589901272143973,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7345,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.5024783982715955,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.6399,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.39008869591246187,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.5918,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.38403357760120854,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6283,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.3703153151685138,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.5995,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.43898842684286316,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.6952,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3947244506684329,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6212,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.4670396325093309,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.6289,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.4422436470698676,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.6243,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.40660022590354994,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6212,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.4123894521046528,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.6577,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.3771549277141085,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.6173,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3849243912963897,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6123,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.4457368375092948,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.6922,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.4284937802894128,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.5886,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.43580633967681354,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6229,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.3753516553746708,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.6282,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.3901069216414992,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.6321,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3808570126535211,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.5877,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.3889992683662418,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.6821,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.5578844893113173,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.74,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.39448407277793424,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6496,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.43531248342604767,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.6313,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.3958844767624935,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.6081,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.41290102969058634,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6707,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.49966471827528597,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.6603,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.41459538303684473,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.6364,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.46718220105261027,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.653,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.4228244529956391,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.6597,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.40558898963210477,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.6829,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4137737033146253,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6164,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.41323312383960475,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.6567,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.38724609494835577,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.5872,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.4023325194626976,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6137,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.4319165152042328,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.6258,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.38427160535412597,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.6338,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.41824609399104956,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6524,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.3643343854443445,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.6074,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.37863486863529766,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.6208,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.40498774926576103,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6275,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.4024760095065096,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6499,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.4931732634039112,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.5824,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3812008164178614,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6607,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.47719901456654096,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.6999,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.45637998849936456,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.703,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.39587011423510193,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6047,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.3739121533561166,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.6546,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.44386622402899223,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.6553,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.5140460706419457,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6769,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.39203232423336304,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.6486,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.4117622174758577,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.5995,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.40390414816218734,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6332,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.41077549636484323,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.6146,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.4676051434023212,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.6488,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3337707882907331,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.5979,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.38180335470696575,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.5706,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.37409126572075707,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.6201,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.41119546624395015,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6301,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.47312142410018665,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.6679,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.3972626708375532,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.5996,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.39227793537484773,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6321,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.4076798390203009,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.624,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.41269879748749927,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.6039,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.7960448041645097,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6188,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.4101254671403745,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.5887,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.438443003520308,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.6455,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.39712667151133474,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6158,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.36115816596213624,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.5585,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.4329200804299213,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.6209,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.4193382300021258,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6392,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.33435332736478696,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.608,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.42534928789770504,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.5901,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4139743745692021,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6503,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.40463453872964994,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.6574,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.43593014759934173,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.6661,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.42209842220072213,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.5941,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.49208295342548763,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.6361,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.3927021791281994,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.6627,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.47280659071619807,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7039,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.6259957800318892,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.6268,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.39543579754561403,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.6407,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3922214144058078,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.5788,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.38711273483843417,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.5876,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.4025097370460652,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.6288,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.35716229127592924,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.5842,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.3991265311076169,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.6152,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.3891308287737353,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.579,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.3845123738441986,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.5741,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.4231506210111426,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.6283,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.4593327149707268,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.6965,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4881232112672962,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6438,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.44925678677877706,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.6114,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.42025807837123813,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.6227,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.42849134787650295,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.601,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.3895228109770154,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.654,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.4835696916394425,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.7137,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3656139578822936,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.5439,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.42596194837532975,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.6934,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.3999166729778335,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.62,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.42934630139262464,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.645,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.43080016509115493,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.61,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.4732621945870718,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.6043,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.46741353727957474,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6325,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.34396909556193217,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.6357,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.3824092719516941,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.6324,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.4018558685092868,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6341,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.3720701038342301,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.5664,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.3973775262116455,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.5774,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.4458687962544333,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6339,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.3754996203130371,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.6895,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.4445078911948587,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.5656,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4844531254962157,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6585,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.4631582566632992,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.7185,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.41355293965505663,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.5511,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.37894823409900164,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6239,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.3894635070241822,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.6214,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.49077750080918164,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.7006,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.45879660209545653,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6914,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.39586721561275995,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.6312,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.40507398909220027,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.6476,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.43893696970881396,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6376,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.4092043057017648,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.5815,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.41025146911962923,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.6329,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.37822235338995613,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.5991,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.548954869584662,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.7331,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.41246247504461564,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.6008,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3776720322716301,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6276,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.412600168791054,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.581,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.39743555097964806,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.5672,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.42723272556656094,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6693,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.3856500310258893,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.6425,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.3778877156204836,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.5925,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3732515087037962,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6723,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.3783984343604273,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6461,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.5938742034895879,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.6332,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.40222798676157995,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6326,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.48395039233064074,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.6593,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.42651977915224326,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.5677,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4882721676027432,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.656,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.3986508067953702,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.5809,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.40311993641305327,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.571,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4242736221579273,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6385,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.44687484952940953,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.6346,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.4693488072014337,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.6168,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.39243857044150077,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.5902,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.42414896118706413,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.6685,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.4169527469706719,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.7182,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.42925948132129943,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6058,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.37048627327545736,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.6276,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.4124669750341732,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.614,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4103369678456755,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6566,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.3719918884790809,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.641,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.36160408673716565,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.6149,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.371123351701688,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.5774,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.37577075425861084,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.5897,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.4220524922249047,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.5942,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4171383655929499,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6207,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.3944743568418283,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.6455,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.519956292062287,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.6385,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.44435779218312194,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6475,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.6111002333666475,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.6577,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.4053432632644683,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.592,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.39684452419339145,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.5735,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.4694868727655645,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.7099,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.48108357618208775,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.6422,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.38441229981223335,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.5825,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.4556440972296517,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.642,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.40616369952408726,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.6632,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3793897321641102,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6461,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.4203113911725351,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.633,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.5910693329538524,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.6138,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4832743885903782,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6647,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.4005174298622957,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.643,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.3873921802193626,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.676,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.48315319790454375,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6335,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.3886811729463303,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.6104,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.4614013846175197,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.6364,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.40131057041768275,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6214,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.4580884350765841,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.5913,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.4242718974878402,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.5749,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.40212616092166964,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6156,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.4243176189821529,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.6396,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.403021958023578,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.5815,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.41790627192937685,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6764,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.44462662217109117,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.651,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.4498091596916975,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.6066,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5160909094338417,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6899,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.4418308692888828,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.6064,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.4296839923775005,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.6417,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.3855594591230224,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6506,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.3634407750565033,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.5948,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.40767612541599146,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.6902,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.48155348442982904,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6122,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.4499236148193907,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.6439,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.3735644137361036,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.6033,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.474407271254516,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6532,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.3771963682513783,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.6001,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.438966461969124,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.6232,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4540021611590887,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6328,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.3905382736624076,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.6274,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.38257765103487784,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.6249,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3859713396947559,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.5994,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.5426179257393545,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.6514,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.43028387031700205,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.5861,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4246457047582852,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6456,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.5092107153404754,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.5961,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.39858739998911535,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.6359,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.46706381998750435,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6519,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.43640298644156794,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.6849,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.4163289069229634,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.647,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3772054625966044,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6088,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.4836480960152808,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.6964,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.3822465242646973,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.597,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.417599387034245,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6582,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.4057234115610521,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.7205,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.46278320430493924,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.6431,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.5983770454803956,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.7032,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.4542534348324114,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6547,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.38144264321719323,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.6794,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.4448017300662135,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.5934,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.48633585978396293,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.7325,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.41169580199804856,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.5883,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4690029675675022,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6954,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.4186555967581732,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.5946,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.4114249847423077,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.6068,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.446459810055769,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6508,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.40735661424570757,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.5591,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.6078548328155869,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.7193,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3991174488262027,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.5677,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.40029407084167784,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6153,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.3859536278935306,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.565,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4306086879247975,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6185,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.4681143432489656,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.6531,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.4494640559550906,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.635,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4913072174432582,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6709,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.4117660804615872,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.6076,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.3909460342982364,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.6417,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.38730337425406364,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6187,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.45208377709908015,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.6333,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.3817534811560179,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.6242,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.37087697886372456,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.605,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.39915063942181117,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.6565,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.40708734210415587,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.6002,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.4986110062360719,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6355,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.3686135121919218,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.5633,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.3721249749726393,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.6423,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.41943545492108947,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6054,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.43284070259335555,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.6079,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.44396734327680126,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.6188,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.41322427472792067,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6049,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.42906402552795747,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.5813,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.4178783262910209,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.5671,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.43260523351026015,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6239,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.3651195153793656,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.617,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.44024604737122075,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.6431,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.4575895735344115,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6819,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.44772975142293386,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.6679,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.5346957922210491,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.6276,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4020209615295135,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.5827,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.5126635789939333,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.6318,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.4510740259510137,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.6483,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4026713047765242,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6363,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.4474478889729829,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.6043,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.4646913173150557,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.6526,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.42690876968217645,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6381,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.42916131016030773,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.6438,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.40725518641858555,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.6125,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.3457561401505633,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6245,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.3695374114392267,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.5687,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.4447834422895891,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.6677,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.460077593372982,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.592,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.4517788956280518,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.6451,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.394925199205411,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.5707,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.459032469937519,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6741,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.3912866116764875,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.5979,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.47860367523518543,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.6502,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3975155074917245,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.583,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.3716372371034665,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.5901,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.4859361284510846,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.6439,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.41541400854188787,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.588,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.37724181861323997,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.5866,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.3888776447864694,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.5849,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.36143809968939544,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6142,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.41988646763204995,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6423,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.4302947634147954,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.615,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.40408026549902326,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6154,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.3702788572172086,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.5793,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.4091136909985783,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.6359,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.43247437071136985,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6648,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.40990498701517686,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.6579,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.43275080338022903,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.711,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4010719626124903,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6459,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.4267113725935911,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.631,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.4035417244970245,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6164,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4940228329440211,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6548,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.5050931635987523,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.6136,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.496454817575372,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.6281,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.36726201288264415,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6117,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.45333278650410636,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6164,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.4458064552754602,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.6111,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.42360344085642504,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.608,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.3697115317534701,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.6007,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.44121091021401987,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.6,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.38813603165578076,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.607,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.37885506588245343,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.6216,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.3861215708237205,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.5602,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.45455262060354645,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6146,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.40691806764142746,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.6232,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.41159366816015297,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.5863,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.4694407527138338,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.622,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.41801063287095436,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.6284,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.42719497482572355,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.6371,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.41285117718354075,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.663,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.42562719337945415,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.6214,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.5395690970992174,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.7304,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.41032328674571755,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6573,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.38578810785439155,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.625,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.4491101885346572,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.6477,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.45385553898315706,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.7031,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.4018436943756649,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.5933,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.41608012446601444,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.6371,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4294878835451814,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.663,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.43563463276347536,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.6652,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.6004249075539235,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.6469,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4095224316449897,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6075,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.45997622670248234,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.6584,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.45250759953032205,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.6387,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.44090708330324574,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6431,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.4062126991791442,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.621,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.4009031195905101,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.6211,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.4712789131210934,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6996,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.5029551258999793,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.7219,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.4654582549113369,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.6843,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.4156968107183663,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.5907,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.5084323948101466,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.6472,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.4685193906875141,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.6377,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.41999282975670604,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6202,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.4746059190621131,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.6289,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.3894929861098244,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.6419,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.4248066340308863,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6349,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.4379302229320316,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.6643,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.40591424881704585,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.6127,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4795653299529607,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6369,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.40234598685828443,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.6332,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.4775141956768784,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.6368,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.41833903955171153,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6183,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.44110044016760425,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.6753,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.4284480343917125,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.6903,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4078046651085013,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.672,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.4991980338518636,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.6064,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.48846284151389696,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.65,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.37930792575343436,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6736,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.3641625378901866,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.6127,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.35150803268088227,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.5941,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3867345224975431,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6007,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.43264281734236804,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.6525,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.41498583015649615,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.6632,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.3836090777226854,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6285,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.3700712115206776,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.5941,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.41995580213863215,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.6505,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3676612351752429,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.5938,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.3788444588538138,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.6074,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.4059418675155657,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.584,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.39567068196168576,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6367,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.38232772509276153,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.6167,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.4651180985905164,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.6676,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.5197233380793199,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6607,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.42387500086006535,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.5764,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.3687634794920366,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.5823,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4116728872012044,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.5885,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.4006186632561352,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.6065,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.38454378122537636,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.6451,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3479609493816106,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.5613,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.5098758914604183,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.6181,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.47578953791996026,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.6215,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.3989169212785679,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6668,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.41939163612008407,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.6006,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.4019676815392459,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.5955,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.411973876920795,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.5836,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.40402033416292676,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.6264,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.4217962070916454,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.6045,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.43905748046918114,
+      "learning_rate": 0.0,
+      "loss": 0.5911,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1648395321212928.0,
+      "train_loss": 0.7008718222618103,
+      "train_runtime": 29232.1086,
+      "train_samples_per_second": 1.026,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1648395321212928.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2bb36c4f7e4b4040a327d36e4f85b8303dd65dc1
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8e2d30fcb7c279e1629e668c1054f6aa396dabe0
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd6c4e8627b9252cbe4113cddd5eba428b94fffef00805e5dd38c407823cddfe
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a3adfe91a9dee1d52ca4ba9b0463fe85459763ca
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2589d852b36e90288eade3695331cd6aa472d4777b16091fd0f467268df7c71d
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b60daf4758d2dabe7517094cbd315a70baa9f0fa
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.9530075362191299,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.3589,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.1209233254545106,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.5026,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.1647674798314256,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.5517,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 1.2984126685547293,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.4062,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 1.0496306752362106,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.5315,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9148416499381024,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.399,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.8129126710040641,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.2746,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.9055627148585419,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.31,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8675338084750814,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.1451,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.9104967299301532,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.1524,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.8735803073190872,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 1.0964,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8812665889972949,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.0851,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.9365546710795096,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 1.0147,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.8118625054570385,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 0.965,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.9224027448538653,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.0044,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.8240090470074574,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 1.0274,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.6776694584147641,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 0.9909,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.5622462135521928,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.8621,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.6494376384802008,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.9268,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.4473607453727907,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.7989,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5970208228748919,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.8776,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.5330213047566539,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.8361,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.5647289108548379,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.8336,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.5154584725991561,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9474,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.6485272082665403,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 0.9765,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.5381159594051211,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.8683,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5204362272884198,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8516,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.640201554627158,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.9099,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.5651120390420362,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.8992,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5202204909426368,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8774,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.5210698306695373,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.8541,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.618852912966987,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.8841,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5421728477298867,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8934,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.5290161885606602,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 0.9035,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.8211442273789911,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.9033,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5324475940959741,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8117,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.46451991320560043,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.8473,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.4599769992852389,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8296,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5242429491517732,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8684,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.5175210915839688,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.9087,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.4500331694359257,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.8495,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5223537308179806,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8686,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.48999253255749625,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.7974,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.5282601459712972,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.851,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.4901323293268392,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8303,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.5134194007789946,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.9225,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.5514875605483572,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 0.862,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5473890815611594,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.7422,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.5145757379137041,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.8397,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.5882197077979645,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.8466,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.48284574637139754,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.7566,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.49211571789559033,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 0.7974,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.52322891431274,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.8635,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4501147985707925,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8766,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.48922557158979185,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.8648,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.546341643940991,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.8634,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.47936360056551997,
+      "learning_rate": 0.0002,
+      "loss": 0.8317,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.45765672456621115,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.8067,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.5118061525014858,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.7969,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5145855756469134,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8368,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.4784625296076965,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.8651,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.46151113581587216,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.8007,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.49175484535289365,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8093,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.4074528589017721,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.7676,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.43776886250252567,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.8463,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5009560087554507,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8322,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.4846580960169461,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.7734,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.5340418592386688,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.746,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.5775043682417678,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8457,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.5016677566081483,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.8604,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.4397189251517687,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.8256,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4900884380603229,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8567,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.4580109565797004,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.7163,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.4586077587006553,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.7916,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.4627256679086634,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.7894,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.4988403345065922,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.8367,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.5236286527623206,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8507,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.6922765686729557,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8996,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.4783234228874865,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.7708,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.612578763748735,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.7545,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.46203764584243073,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.7887,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.45244911162443047,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.7744,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.6566375324709214,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 0.833,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5194199924365458,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8446,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.45945184735452566,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.8348,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.46089092617175464,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.7633,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.47514977024471644,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8203,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.6291425538885136,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.7998,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.549901839877944,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.8657,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.46661448203093603,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8133,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.459206129392748,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.8299,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.5623178864601807,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.8385,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.45348731897178707,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.7574,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.535542349820639,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.8661,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.5022061293461977,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.7923,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5202146438079589,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8313,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.4201263667221269,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 0.8469,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.48233205307936633,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.8151,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4671449281242293,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8306,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.48299374492785263,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.8501,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.48753971529645357,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.7638,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4674860574247839,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.7976,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.6491059212719197,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.8005,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.4415398829830088,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.7475,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.5024903122111678,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7964,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.5094836435189203,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.7924,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.5046458098684509,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.8139,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5319455233865822,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.895,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.5553777319473407,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.8232,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.43313878035714637,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.7928,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4831267278735094,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7515,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.5429097406906558,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.8682,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.44352321291184216,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 0.8025,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4945646959168046,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8142,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.48152743089150607,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.8533,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.5239290869484868,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.8501,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.5108716008294788,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8221,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.5882597720261666,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.7737,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.5275886823336764,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.7448,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4512469637764529,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.752,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.6026573460803222,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.8495,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.4621277101738424,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.7959,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5248664374789274,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8501,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.5040839994674604,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 0.7992,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.4824258987952713,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.7573,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.44725470330345835,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7947,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.47105156541524995,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.8229,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.4933021592132359,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.8778,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.45689750144583013,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7978,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.4567165648731524,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.8177,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.5683715743251585,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.8763,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.44138942802531556,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.737,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.5293349882470612,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.7573,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.55464443039697,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.879,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5069743884549222,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7757,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.4737485373103175,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.7917,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.4426742568055296,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.7931,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4836372167456052,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7804,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.649942284751794,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.7834,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.4783968078909489,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.7234,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.5941916056960217,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8439,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.5213384394980335,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.7826,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.4231337229376453,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.7664,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.497995175665305,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.8113,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.40749924047351943,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.7408,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.4534240521727463,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.7769,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4344782410376436,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.7083,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.4854700699850418,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.8423,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.48673791357084556,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.824,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.40601082970046964,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.7531,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.45113518119875684,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.7536,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.4554053402986199,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.731,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4662146968116339,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7935,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.4695700873307507,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.8181,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.5550879597141647,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.8821,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.47521271819956,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7992,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.47136209486211345,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.8273,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.43887650969681197,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.779,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5143492554378482,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7945,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.44170754373898863,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.8181,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.45954856452138815,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.7921,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.48147848641435625,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8148,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.44842730592611174,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.7334,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.4508095645687276,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.7911,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.46608885480827533,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8108,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.45877070958739413,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.7783,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.4304921622882453,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.7841,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4366462173180608,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7464,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.43983667199720133,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 0.7576,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.41920982657832667,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.7612,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5046837642313663,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.6914,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.5634990573175862,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.798,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.4873656252594583,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.8175,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.47219333626555077,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.8465,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.46148219457852263,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.792,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.41285793437442386,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.6803,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.5131710262594903,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8314,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.4061201541559843,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 0.795,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.6144398063126736,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8947,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.48165864832622596,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.7788,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.5924083535442879,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.7752,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.4606648845819176,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.8138,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.42228257086642856,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.797,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.4761085936785347,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.7721,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.6890945548578655,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.9284,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4251932131230442,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7526,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.47175939719829446,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.8705,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.40835561208500687,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.7245,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.46055583539677014,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8185,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.3937627480015215,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.7648,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.5247079534589798,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.8725,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.39572320717463216,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8015,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.38594413201314715,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.7124,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.47713898115659037,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.7909,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4402783065445849,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.7051,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.4810832587373293,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.7395,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.49793847135263325,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.8246,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5163620957941052,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7877,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.47559073746323993,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.7746,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.5135088027783049,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.784,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.50758928100271,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7726,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.49973486643233367,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.7804,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.49498188569913276,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.7357,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4848853436284693,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7357,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.6515538005832042,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.7494,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.4230006960565803,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.7791,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5197799537090696,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8335,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.5693892249171181,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.7894,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.5143451510978614,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.7676,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.563024349286294,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8176,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.4834689700808916,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8119,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.47788521778441795,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.7896,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4739362949248575,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8031,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.4557070068611579,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.7562,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.5123175930425836,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.7483,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5098661037143992,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8108,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.5360911309354575,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.7999,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.4940892054415995,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.7901,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.5224086792151261,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8482,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.4732769215032251,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.7775,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.4808299578109734,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.8253,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4802019444834149,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8379,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.5681970582081078,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.8801,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.5489033879490961,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.7569,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5026972332593719,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.6953,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.518880954784583,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.8101,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.45054964365051964,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.7742,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.459553611277924,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.741,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.5392204029433525,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.7943,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.46763358522659854,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.7506,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.48796001912296344,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7885,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.41612431290378915,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.7818,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.4170485922941037,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.8096,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.40575526010535407,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7689,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.43631370693190186,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.7512,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.4549159681585235,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.8294,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.47339024015309783,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.846,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.43902206036413266,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.7381,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.46220739725013227,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.7356,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5516968267415471,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8354,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.48050633352830363,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.8231,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.4853375372865061,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.8271,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.44556603279849505,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8374,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.4719602233654268,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.7445,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.4061586873518868,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.7623,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4364473795948971,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7568,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.5127348804567465,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.7951,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.4370678397395421,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.7307,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4802652726484155,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7772,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.4716996473593707,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.8224,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.45179297458376827,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.7197,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.6256760091274871,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8568,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.40652398308119203,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.7612,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.39826248207703735,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.697,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.5388984002348581,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7699,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.4894846768153503,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 0.8185,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.47452035013179233,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7656,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4638104459253293,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7817,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.48208730761737223,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.764,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.404761226193334,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.6982,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.4453259656128742,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8025,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.5874041451444244,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.7778,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.48081228484114513,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.834,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4489761686206763,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7639,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.44121347639841846,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.7254,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.5458503844312422,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 0.796,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4292348510426145,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7528,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.5601307983573015,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.7965,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.4661341927779982,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.8204,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.44290954581930936,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7235,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.5217928965079981,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 0.7719,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.515952008050984,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.8093,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4707600737870565,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7332,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.45512111329432015,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.7657,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.49386683069536635,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 0.7748,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5297188316292964,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8383,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.4755481145022749,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.8238,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.49557700110676095,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.7817,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.46635936642688935,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7978,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.4289128821422162,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.7138,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.48738314609520644,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.7596,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4494836479899092,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7568,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.45960464062943973,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.7752,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.43800030980808136,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.7668,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5008451762030025,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8096,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.4448951923072958,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.7807,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.5348016616275901,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.8275,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4475826890866102,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8464,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.4516093386718875,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.7338,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.423419919326598,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.7429,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.40929044163186284,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7629,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.3915640933466481,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.6816,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.45028124322231367,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.7466,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.44664051619568207,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7082,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.4318842557767651,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.7901,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.419089019477868,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.7535,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4746626817050296,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8016,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.5561412121845948,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.7921,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.48779717445322235,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.8115,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5179134634333581,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8573,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.5164662637840521,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 0.7585,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.5557415194491732,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.7639,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.45507954671227385,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.6824,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.5328486021774697,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.7466,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.466952483557593,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.7895,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5050910069933272,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8058,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.4437035269566679,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.8091,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.4149488344882996,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.7001,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.41420753794845466,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7069,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.4224940665853059,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.7945,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.40145509406901553,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.746,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4732434289419564,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8353,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.4229366424040278,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.8353,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.447666319979107,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.7846,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4076824839696985,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7914,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.5306119733066779,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.8472,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.43930612065109,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.7688,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.48951247411124604,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7647,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.39437844060981536,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7146,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.5106490024266829,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.7673,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4554149817154902,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7742,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.4128434656129718,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.6839,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.4365532500068166,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.7491,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5404820208818948,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8756,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.4026796553720846,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.6692,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.43593661521496024,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.6858,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.46833989079793203,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7437,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.4638802297033959,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.7714,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.44901973665971945,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.767,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.40021188967521376,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.6805,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.47974999968855614,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.7806,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.4905673523982542,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.8136,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.40865028043523793,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7654,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.6085964488367325,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.8238,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.49235639869921055,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.7707,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.42159691986144743,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.756,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.4395918565248622,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.7879,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.4415135411457727,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.7335,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5298497796094817,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7892,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.4095062084473622,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.7013,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.4166526105460935,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.7216,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4380297982805859,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.6844,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.5279780310929358,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.7763,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.490170387095156,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.6931,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.49544220794163213,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7736,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.4131953727130121,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.761,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.4388218560264117,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.7594,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.45875231207455275,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7495,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.4835149475399592,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.7636,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.4374529769827338,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.7329,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.4494590328996156,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8096,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.4642424475543717,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.7812,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.47429688727853647,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.7811,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.448926289660477,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.821,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.4932646325656744,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.7139,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.4507269223674137,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.7674,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.48975523983555974,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7927,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.43803365488688517,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.7569,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.4282559136963825,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.7743,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.42038876392386426,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7075,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.38610534697116794,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.6382,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.3790510813289947,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.7069,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.46441742712105416,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7565,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.3788528482979111,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.6764,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.5086491781505815,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.7939,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4254679186082148,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7762,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.4420755098500798,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.7413,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.39978322577235464,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.7203,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.44588865008638007,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7545,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.4557262705357927,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.7797,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.4415351173164515,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.7454,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4478151950710153,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.741,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.4178673683561562,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.7403,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.39178804569712133,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.6889,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.49862204004927757,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8009,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.46341674560578006,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.7867,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.4462377795277957,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.7778,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4168182898396169,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7058,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.4108982624761322,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.7243,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.4121430838074208,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.7884,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5040950105093712,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.74,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.6185090953573744,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.6944,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.4562410017066912,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.7888,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4557627500595636,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7516,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.4493133053651138,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.7748,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.5046213484524635,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.8101,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.5725051732278246,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7522,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.4502329509184404,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.7394,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.5010140763094688,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.7558,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4541734409806498,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7319,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.4692263403190528,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.7743,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.4375963223108687,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.7417,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.39942230274315993,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.6796,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.4249713353545521,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.7647,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.44763547188312713,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.7153,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4272917764036961,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7357,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.44891642615274496,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.7529,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.45061301245464935,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.7357,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.5162333764394879,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7415,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.43200165971218957,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.7658,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.49127325690048357,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.8219,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4322687546291749,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7127,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.4352965224137496,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.7437,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.4038322942096568,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.681,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.44329753535341127,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7773,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.41070677887586216,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.7477,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.4717483311142247,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.7002,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4131348832568817,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7224,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.38310709778811125,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.7388,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.5339055401075089,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.8489,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.5428931097118624,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7689,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.506089095574403,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.6999,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.5000636143770371,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.7746,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.439603252476578,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7477,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.41888613720812434,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.7187,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.5425059948079629,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.7816,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4452587992918538,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8067,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.44327636724949143,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.6989,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.4522294368472925,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.7711,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4361127825366765,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7249,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.42057147948553086,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.7108,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.4180384592963417,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.7167,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.4039847626846038,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7019,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.4932029576011835,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.8274,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.40645071321038856,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.7336,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4418467538632973,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7251,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.4593383852135761,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.816,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.5360074555853559,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.7608,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.7932283592094297,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7652,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.4871171097026091,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.7527,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.43079017734380354,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.767,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.43719420402094894,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7999,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.5559181175868851,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.8166,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.47293088861257343,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.7727,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4684788662952495,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.6992,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.4795723386291384,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.7253,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.5028741627697558,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.77,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4241329169362859,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7144,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.4475491212762779,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.6521,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.4626738540177879,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.7443,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.41634357797444965,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.773,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.5873436781277276,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7895,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.4196180679340285,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.707,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5129971633116558,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.8006,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.4552676154927619,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.7615,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.4011866327018788,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.7422,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.39768724784431175,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7593,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.46512417794849265,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.784,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.410495091377265,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.7217,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4481590241534154,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7459,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.4110190895695786,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.7447,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.4399110104984658,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.7624,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.46463546930000155,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7351,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.4335397389346068,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.8026,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.47598294181782197,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.7617,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.451875150204458,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7775,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 1.0696155565442207,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.7043,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.4155974995971046,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.7324,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.4597044836427236,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7551,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.45732067571731677,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.7749,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.509382063027756,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.8071,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4627551430394898,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7433,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.46220165634985766,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.7225,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.4392746142774141,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.6936,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4417835047957786,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.6757,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.44007228791509273,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.7657,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.4574407043844708,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.7277,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4142482944991072,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7399,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.43754975132798457,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.7258,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.47015803950012963,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.7298,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.39654418276783754,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.742,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.5012584870602004,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.8511,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.5218265939980339,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.7946,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4535168055954105,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.6896,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.4355154631246511,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.7456,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.5032416045066815,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.8194,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.42058160115791615,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.6809,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.47682544144964806,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.7657,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.4478517958175801,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7665,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.43045307970830915,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.725,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.40541676428183027,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 0.706,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.43198955167917824,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.7422,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.42700566025514264,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.8277,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.43231994835750615,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.7047,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.43160023093573086,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.7397,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4226887446666897,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.6831,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.40323279067236023,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.7551,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.45430304297247726,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.7621,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4430349947086925,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7392,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.3888799232459701,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.7134,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.40725016029724603,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.7194,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.43932329112331703,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7664,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.41473461673561246,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.7426,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.46272892040791574,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.7554,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4427706963649241,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7821,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.47145971292056843,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.7646,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.49741608905805534,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.796,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4413466129054198,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.6825,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.42218546639118704,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.6896,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.40492674247425203,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.7313,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.3965105966499714,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7127,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.43131350442620386,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.6961,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.49782761355130234,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.7276,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4492292504112796,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8129,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.4927718040432481,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.7421,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.46205345397489017,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.7244,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.4820921906889282,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7976,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.4025172261059256,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.7616,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.44780584646886484,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 0.7582,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.48758655722779376,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.781,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 1.7151408551401242,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.6497,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.4051295030789714,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.7334,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.5067476687002975,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.8223,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.8260851852010912,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.6899,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.4462804427340422,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.774,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4838041061473331,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7862,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.4234453539963326,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.6884,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.4579586030373765,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.7979,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.49001578568876686,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7449,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.40425098791151753,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.6956,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.4057010560932579,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.6925,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.3636636017347738,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.6853,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.4185024135524768,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.7345,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.5271803112585407,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.7938,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.41509775167313745,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.6856,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.48121328285184867,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.7351,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.3365621861142254,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.6568,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5003697225702642,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7644,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.43276308167152966,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.7071,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.4616294263731802,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.7714,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4064785119701553,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7206,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.48378727060758436,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.717,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.4290622585368261,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.7888,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4961258123079247,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8515,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.4570765932540526,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.6933,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.4374985572224133,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.7807,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.37000039379065575,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.6932,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.41495774150290576,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.731,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.3918581040740028,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.7141,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.41713163300704226,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7605,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.43461384226091976,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.7818,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.5213258996049039,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.6994,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.43775019357143186,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7801,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.40810234181034066,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.7297,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.451393732520972,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.6761,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.41830630947376274,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7271,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.45765934665251606,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.7133,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.39266794743367506,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.6791,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.4647808500784693,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7461,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.42211551580666573,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.6743,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.4274305253989335,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.7091,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.46075449646671,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7158,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.3829706690823211,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.6864,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.43166617637463833,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.7107,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.41566587878888706,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7463,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.4861445190929394,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.7505,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.42899606914335303,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.7696,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.43587057898117887,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7346,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.3758043179957542,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.6867,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.607467088182954,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.7239,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.4167855700816348,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7495,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.4236235965130521,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.7287,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.4517003758084696,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.7785,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4092945089652458,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.739,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.3956011254706419,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.6638,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.4885937490577909,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.7528,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5059711043497862,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.8453,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.40573829573580594,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.7088,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.465500685621695,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.7224,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.46349954850323677,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7806,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.4084502476742288,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.748,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.46618026160937986,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.744,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.46057581494492433,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7116,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.4186403126544077,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.6986,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.4887227639710563,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.7652,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4232905490912348,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7727,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.5453311204995098,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.7893,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.7462875293508915,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.6562,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.4371718880458593,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.6955,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.47518833463505966,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.7473,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.432898725708365,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.7271,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4257339192815698,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.721,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.4725311812328365,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.7695,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.4114555349920045,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.7029,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.4477687628891849,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7415,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.6002534398271313,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.7381,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.42608905810913983,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.703,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.40846037614765474,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7207,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.4058439104134784,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.6249,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.41719280094549344,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.6666,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.4658406262264673,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7272,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.42862341920626473,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.72,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.41527747442009044,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.7642,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4828736577182294,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.8004,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.4274931663452689,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.6735,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.3742559630665149,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.6951,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.40464248241590833,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7478,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.44675532932647755,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.6929,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.46074773992887674,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.7217,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.38647957927545307,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7062,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.4246676513017054,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.7475,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.40868851802811557,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.7239,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.43669275845257827,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7675,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.4700495251104396,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7668,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.4245985896912604,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.7146,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.39626767330894036,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7086,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.4420705016331542,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.7244,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.38883265984573706,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.6957,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.39593162571543766,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.6991,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.4282959021196411,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.7266,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.45935899293397536,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.7648,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.41579751479898225,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.681,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.45019992044675045,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.7892,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.4012938728365543,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.7167,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.44542554540353824,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7242,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.5166874109785967,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.7425,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.3861045162617359,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.6238,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.38189145539763464,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6525,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.45093214082403266,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.7241,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.3998854029693829,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.662,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4761640842735284,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7143,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.42470657304255977,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.6674,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.49652677264204886,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.7792,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.4501076437599239,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.6878,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.42228597830501624,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.7351,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.437376548686482,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.7486,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5178983174543825,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7648,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.45741607403630036,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.7689,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.4354725296031309,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.6399,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.46844788057159364,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7892,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.48302739183458254,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.8101,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.48983003466422564,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.7603,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.48213837589840736,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7638,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.4342424993196076,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.7447,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.4117746056539576,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.6763,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.486834356129473,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7339,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.4033184482161715,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.685,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.4425030536429688,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.7379,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.5293214087093348,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7537,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.5095684563003335,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.7049,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.4659713562959859,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.7359,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.6119798190509462,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.737,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.4708897469837848,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.729,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.44068333484512756,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.7278,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4134928678532226,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7084,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.47449717510451944,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.7386,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.4427366038885486,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.7096,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5097252721080822,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7793,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.4473496568576032,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.7518,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.3866823362517065,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.6404,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.42438262541061145,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7081,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.49120729380307165,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.6842,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.3898907761516741,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.6654,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.43020618806679295,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.718,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.43215457447900185,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.6917,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.42322867622099,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.7603,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.4036510101547229,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.6841,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.4862966488933275,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.7459,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.4179069068821482,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.7176,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.47772376010986584,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7915,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.4649768813334875,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.716,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.6083591803435475,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.805,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.4672825249220747,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.7404,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.4716621448203455,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.8138,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.466442215550732,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.7081,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.5398723187413823,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7096,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.38001693981880735,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.6827,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.3791680356368391,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.7309,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4307431262259101,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.6981,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.4065214548484782,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.7182,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.4093230703093243,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.6789,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.41057555483980834,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7525,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.4914727714968542,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.772,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.40548534436984224,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.6628,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4729355852851271,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7216,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.5009609876468544,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.7836,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.3834101745585515,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.669,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4800530337707906,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7649,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.4884464679203419,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.7542,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.4380897346481966,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.6753,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4135500392219363,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.6421,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.427629710378204,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.7122,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.4228166917204371,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.6429,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.45349718531469274,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7213,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.41425188990800704,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.7262,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.4175380869497653,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.6778,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.4794562484958292,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7359,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.4403153698666154,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.7098,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.48244562188451673,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.7391,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3828289265791597,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.6977,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.443166307466767,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.7117,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.4219277645777576,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.6872,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.4543729245200786,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7291,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.4336134648315488,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.7074,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.5216204706317024,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.7158,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4323115758318995,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.6855,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.38010089550321274,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.6704,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.42628614027943795,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.6376,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.4708167301566809,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.6968,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.38995110538793865,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.74,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.36705363589139167,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.6154,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4028965104030593,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.6974,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.46764764133097736,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.7056,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.44110688137694515,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.7191,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4061256130298787,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.6817,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.4491104837769522,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.7538,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.40016104536571145,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.6359,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.39878375968038315,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.6714,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.4724281238370389,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.7213,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.45664756846229454,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.7172,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.45508257643312405,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.8259,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.4194073068520734,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.6845,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.4229920455577015,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.7134,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.40939403836641886,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7229,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.38061804235872376,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.6661,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.3673298641903888,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.6698,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.33974242079986683,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6866,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.4676215441430991,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.7178,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.4156798745638868,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.6987,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5414531691031648,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.6823,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.41556486731924086,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.7134,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.4421144449207165,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.7082,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.48901637476282334,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7534,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.44142637208023006,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.725,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.4240057251397034,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.6957,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.439688615426819,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.6955,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.49073270188074036,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.7933,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.4063339004499898,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.7032,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.45779049125750276,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7243,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.404914939178033,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.6788,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.3972068026390955,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.6703,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4879619968648427,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7201,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.4679964681011993,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.7735,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.42072753597519685,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.6898,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4726482957516662,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.6847,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.4887903834493641,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.7416,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.40143329891058066,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.7716,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.41547353873035464,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7366,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.40206206668994704,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.6793,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.41347048078549353,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.7255,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.46793924880980436,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7162,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.38303959136396093,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.7203,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.4579613008032338,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7542,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4553518771062613,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7761,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.3753163192522391,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.7468,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.4037705646029616,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.7258,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.5474063355670041,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7031,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.40706363110628097,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.6722,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.4504615444140748,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.6885,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.47968476769032353,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7307,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.44061680012937754,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.6696,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.4305693762750893,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.7481,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.37222997154847426,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6699,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.41079752183961143,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.7348,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.40773869451985867,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.6857,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.47169748823654517,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7367,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.45001025674489187,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.6541,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.4345333849845057,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.7513,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4438907915023372,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7784,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.4133629646551236,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.763,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.4214493595283753,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.7029,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.402566253476059,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7763,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.4395179020145512,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.7388,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.4019633578648377,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.7098,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4221451934470314,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7585,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.49205607465247114,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.6805,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.4687478768923137,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.6652,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4420764881796587,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7045,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.48578145246982046,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.7397,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.39700424142863916,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.72,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.45825900340675857,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7296,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.45527861986215046,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.6884,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.4481830986741094,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.7575,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4987312925439467,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6603,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.39534240050858144,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.6948,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.3968442688352337,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.7356,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.39866634478629737,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7222,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.4371120490452384,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.7211,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.43599166673531314,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.7457,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4492614272388021,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7184,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.4129896919942516,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.7208,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.40376552851306863,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.7316,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4185572212312969,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7052,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.4881424014088192,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.7074,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.4610877464624192,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.7387,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.44572746138602026,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.663,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.36716569675658006,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.6498,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.36917178328838557,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.6883,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4047570582334391,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7388,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.45436952725535107,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.8112,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.42679066990560155,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.684,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4382352768580495,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7128,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.4145094988517289,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.6848,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.36075755397066733,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.6619,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3712743508124415,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.6506,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.37546797432265555,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.6462,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.3767351358646228,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.6773,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4710095609101957,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7257,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.36281296561704746,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.5821,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.42079112992195267,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.7005,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.43713247867130345,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6935,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.40721407933818515,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.6983,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.416091539276782,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.7329,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3551946822819585,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6448,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.40351741659392415,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.7009,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.4787621556554398,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.7258,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.3892528430573125,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6523,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.37727840676156577,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.5908,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.43095274549454,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.7181,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.38310526976068127,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.6361,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.5635091893555705,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.704,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.41744990618571426,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.7536,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.40509174607945353,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6173,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.3940937013071912,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.6644,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.4578480614007167,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.7495,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.39450558796923313,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6689,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.4470652101566401,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.6808,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.39295591566329835,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.7356,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.42222344198393386,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6428,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.4625162506790576,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.8178,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.4022265574948015,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.692,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.6056096144027203,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6708,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.46250737240915346,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.7153,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.4117622696079509,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.6545,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.41224353888067033,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6996,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.3624384260730754,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.6831,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.4377808159656789,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.7198,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.36137625136521534,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6056,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.37493121821106284,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.6888,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.3718927110258117,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.657,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.5031044687304947,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.744,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.43078111700694655,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.6694,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.4639228000745811,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.7747,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.39758629630564035,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.6426,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.4063459316169701,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.7086,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.4748592102336087,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.7329,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4205979075975615,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.6909,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.44685321510698384,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.6986,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.42504187867710974,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.6404,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.49042418899558937,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7014,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.41850019432821417,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.7016,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.45347906795750986,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.7231,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.387587711988338,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.715,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.41162680743585645,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.6825,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.4142208409837441,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.676,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.42920067190846045,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.8196,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.4638661761075869,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.7171,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.4742798926425448,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.6643,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.37690347692164017,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.6683,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.4042102819655462,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.669,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.3633750799310292,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.6655,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4772890784984497,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6983,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.4067014806459648,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.7198,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.4342151246933155,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.7167,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4834339455954333,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6081,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.4679919863240487,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.7148,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.3728667535780497,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.7109,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4180859609410972,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6249,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.4440989914199599,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.7309,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.6043139218010019,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.8382,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.48735406987908186,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.707,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.4125045692505512,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.6703,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.5310983266217417,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.7686,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5097574837010186,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7524,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.47956622154740414,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.685,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.4561679545204111,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.7041,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4154388504558048,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6099,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.4543256122908966,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.6958,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.41312564730628,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.6957,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.44743686800004545,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7224,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.46432629274912773,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.6909,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.4492010051884188,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.7444,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4040632376780653,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.6699,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.3778954928686016,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.6573,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.42941828850566793,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.6319,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4268571606261546,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7451,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.39453614407684384,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.7385,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.4036651489847049,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.6389,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.3973231219955467,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.713,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.3938388324647291,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.6975,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.38572634041719334,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.6308,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.46729208523626325,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.718,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.4124496636141174,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.6791,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.42916527827309303,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.664,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.3858954342354503,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.6626,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.394754016424999,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.7088,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.4383828229399499,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.6659,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4018923458310357,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.716,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.40402159693430423,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.6761,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.39471185256601704,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.6687,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.37248706946436677,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.6876,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.43095559414585216,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.7514,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.37468690060253285,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.6628,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.39739859446740033,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6563,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.4069890132906869,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.7077,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.47143700583514053,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.7081,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4738869800644348,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6958,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.38031191606305387,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.6723,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.46867394581105737,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.7427,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.40511505624824656,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.6834,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.37366106956900996,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.679,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.4043939330555839,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.6883,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.4260025289606062,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7284,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.39658217703840415,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.717,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.4631914880344376,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.7042,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4275072332001429,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6456,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.496698488911602,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.7391,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.4266729730283349,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.6477,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.40492842730773204,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7057,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.417203016214931,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.7141,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.3596592617100103,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.6502,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.38557498729928935,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.6876,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.40103013114864566,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.6916,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.38436379613282606,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.5922,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.47627052391103164,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7174,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.4073969434937531,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.6773,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.3773290918934115,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.6248,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.43903890663176887,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6832,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.39105490582606944,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.6208,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.36021733418000385,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.5952,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.3646626277686097,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6183,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.40163882230877435,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.6859,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.4406773319947825,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.7309,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.400770912566185,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6978,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.4090972088523414,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.6985,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.4430850034506034,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.7171,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5104233164266083,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7206,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.5337145411704607,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.7028,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.4654800068840646,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.7164,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.45156944572502017,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7286,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.4488343394437967,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.6555,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.4176500164666214,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.7031,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.4568540494066028,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6915,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.36636546293185185,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.6404,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.43114833573416217,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.6521,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3939861121073195,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6629,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.4260211410168558,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.7409,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.3940632056381582,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.6455,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.47128360899565275,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7293,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.4570639352777145,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.7703,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.44166486237645275,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.6754,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4291141079011354,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7155,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.43701127729116224,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.7297,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.41251108166319733,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.7382,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.44539839009574256,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7178,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.35839327545970345,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.6291,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.4066840426301633,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.6569,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4381921529525435,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7229,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.4095186601877654,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.7245,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.38636903150512125,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.6828,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.6260488460130194,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.816,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.37943573207293746,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.6282,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.45185514579403613,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.6902,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.44337785555605214,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.681,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.47565324340343806,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.7147,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.43158006078313926,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.7381,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.4187257063640549,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7025,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.37507567267923686,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.7284,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.4362336355553638,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.6565,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.41957720951009236,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.6875,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.42848222659383123,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.6724,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.38569769294882406,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.659,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.43222419573723375,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7376,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.41762869586994045,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.6672,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.4151854212676284,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.681,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4013939727263469,
+      "learning_rate": 0.0001,
+      "loss": 0.6539,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.44025614029414273,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.627,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.4851943989181806,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.6842,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.4806947511141547,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6682,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.4117238232036571,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.6487,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.47733044825034754,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.7449,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.48317342547621384,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7296,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.3268073323556687,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.6212,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.3915524465052016,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.7185,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3817787587426184,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7066,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.41660734367905067,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.6731,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.46479312837310793,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.684,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.4008729136774808,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6768,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.4242250122226439,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.6652,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.43686101699854446,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.7035,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.41471733807270394,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6341,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.4216582084990744,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.6415,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.3600462255058733,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.6844,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4057188519578744,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6586,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.39840093105740715,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.6503,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.4256026109442371,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.7083,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.5092089395428362,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.6475,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.38207338014783776,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.6826,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.4082864767051089,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.6569,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.458709563299764,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6756,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.40946611755676315,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.6861,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.3739907081716999,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.6836,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.4856089213522823,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6764,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.3766340947339158,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.6089,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.3819603841632572,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.6088,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4723446206807123,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7164,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.4358190408613106,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.7187,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.42383727681651934,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.6573,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.36709869944338097,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6417,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.4062515478356949,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.6724,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.4354605385703807,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.7404,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4763139923022005,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7233,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.3750523405124464,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.6729,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.42329434731731186,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.6796,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3859846464386311,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6804,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.41894947258061355,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.6697,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.4314825350716821,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.6496,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4714068042823263,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.6685,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.4283526186401107,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.7105,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.47671975373092673,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.7895,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.42490613681725325,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.755,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.43598164411052964,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.5963,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.5044110827187931,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.6823,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4163631762938455,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6874,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.6574527142102977,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.6819,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.41503089259838405,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.6267,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.4371772499619709,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7382,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.42666590640185137,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.7149,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.36601998126699087,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.6834,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5437344236236936,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7006,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.38132873616786067,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.7248,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.47486074810590434,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.7776,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.45698397555151876,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.6543,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.4222745211512193,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.6682,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.3975252007203213,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.7073,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.36462621894528313,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6548,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.46299205272186583,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.7228,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.4099166364848886,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.6679,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.3563235630214903,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6399,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.42017677018775823,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.6557,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.37422163801109426,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.6781,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4244094126169354,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7105,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.4273457906424879,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.7027,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.5429630506062204,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.7273,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3945842034493147,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6233,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.4276874529823611,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.6882,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.4383449056836858,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.7074,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4664996080845354,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6848,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.3718922182224511,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.6321,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.4078353276598548,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.7739,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.3824518450621971,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6549,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.38449336937285716,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.6315,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.44187691432038917,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.6751,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.43181725311216423,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6939,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.38920706971716057,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.6679,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.45150061599409774,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.6919,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.3585658057427239,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6491,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.4336551483648102,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.6221,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.415951014350006,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.6321,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.422624518486723,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6913,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.5028905039257692,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.7145,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.38879350823351877,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.6286,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.4064224765126606,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6805,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.44986365222772284,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.6571,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.44248101210506857,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.6665,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.44571400690033497,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7224,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.40878613792979684,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.6553,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.3776861200286177,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.6324,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.5449941507254645,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6746,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.42283035357198445,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.7079,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.39700699285483754,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.7007,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.43262492021103666,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6409,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.4472394720336021,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.7294,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.35841758822880526,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.5765,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.47823220967303554,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6573,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.40298371612589157,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.6166,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.44584424650919,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.629,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4208691909512677,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7249,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.3491792959128797,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.6691,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.36491087398456773,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.5796,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.40813117896238155,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6867,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.44940589005817705,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.6958,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.4144835924770522,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.6491,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.39707376034497954,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.5771,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.42123790275817014,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.689,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.37312709892533785,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.6464,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.45093709079037364,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6174,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.4031053081919436,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.6632,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.42902271009367576,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.6407,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.43050650172847615,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6925,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.42447102508079837,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.6767,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.4209436167654291,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.7065,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.3854169051220057,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.7035,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.36149464508948714,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.6117,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.5357696910183349,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.7273,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.42903143003003225,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6957,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.3888063195583562,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.6655,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.3395464244689389,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.5678,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.36372538496696344,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.5905,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.4435374035943677,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.7141,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.3980093384067443,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.6604,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.33984862224305995,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6245,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.38295421732248536,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.7128,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.4491590184147579,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.6315,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3767130186401547,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.572,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.3972548195229604,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.6308,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.44055381077950156,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.6866,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4104075619845114,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6666,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.3907331405509662,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.6599,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.3847663055317739,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.6741,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.4628313913008596,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6593,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.4647925650314762,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.7455,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.41261519664681573,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.6954,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.45444836790689513,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6606,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.5358891340024584,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.7436,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.45521731751388783,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.6796,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.394011079694462,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6235,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.5079189076277912,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.7749,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.3928134231684419,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.6817,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5111921296335108,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6678,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.4081978794658278,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.6628,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.4053211813220869,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.6032,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.42983168496764484,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6636,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.4644008516765664,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.6703,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.4574504774505039,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.7518,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4772708644960783,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6889,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.4435426325155918,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.7131,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.4366700926912915,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.6809,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.3963571015044681,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6618,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.48419721275297006,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.6748,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.38905688201468447,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.6205,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.433394986559628,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.678,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.496769545403158,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.7682,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.36142198179226703,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.6383,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4636565826974786,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6458,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.417421627598741,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.6481,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.4472370301922988,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.6952,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4088784521699277,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6465,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.46647558279832896,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.7393,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.4053385750067436,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.6574,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.42757964213529087,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6633,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.4470110173526119,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.6793,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.4193340190231634,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.7022,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.46156971270395714,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6946,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.6119050871362105,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.713,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.3992646868223229,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.6214,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.5217558447619373,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7425,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.41823835650592406,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.6679,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.46847038553389453,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.7256,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4600176726426822,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7218,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.5067414577175797,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.7144,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.4557479530568632,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.6882,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.3920277763332182,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6492,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.4845316811457574,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.637,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.4421634183836985,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.6482,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5306364811458931,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6295,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.6189704780148215,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7222,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.438502098448414,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.6524,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4339981933957319,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6601,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.38820940513874697,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.6566,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.4463376889999683,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.6638,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.41195369045521074,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6777,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.3841239347667893,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.6372,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.3868369894075657,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.6393,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5146796448296774,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7299,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.43790325034252436,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.7157,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.4398587464943831,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.6846,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3595461758946452,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.5901,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.38109497064771947,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.683,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.36004089054337374,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.6178,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.42637815638404397,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7173,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.42425891002201527,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.6631,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.450524267432063,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.6258,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.48785012034753317,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7325,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.3811145492840182,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.6453,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.37249313313450727,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.6186,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.4432375348141401,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7176,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.4919086266265921,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.7397,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.40437823487566327,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.5864,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4684502401638013,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.6577,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.36718558673611307,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.6323,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.3646893225239305,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.6039,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4330441706528994,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6565,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.4141825846758376,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.6368,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.4131440350198122,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.6301,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.34632135053706137,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.5804,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.3492834275916797,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.5651,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.454126021627074,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.7312,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.43030683923975094,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6441,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.4579726985783972,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.6526,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.4667751571777085,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.7083,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3716727407951127,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6583,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.4886366499037081,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.6991,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.4810398000322877,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.6088,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.40501659908197385,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6343,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.3872785713763552,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.5938,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.4262127961256045,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.6683,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.36594726725387117,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6589,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.5296772394839944,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.7472,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.40591808570497256,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.6199,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.33499027276495974,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6213,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.4490471112002442,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.745,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.4004004209960038,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.6736,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4081418069053937,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6877,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.5510970610876371,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.6336,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.577670413375373,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.6311,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.3905336200852035,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6562,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.46209659837967093,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6951,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.36284719198992044,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.65,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3963866904526784,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6036,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.42869465690151415,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.6931,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.4030905528391357,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.6693,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.4299121498011599,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6789,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.3869971495167029,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.6083,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.35145059252234884,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.6021,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.431472130038551,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6127,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.5211453824777301,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.8389,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.43216368219278184,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.664,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.3652179821869812,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6094,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.4280942594177932,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.6872,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.39141921847889477,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.5987,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4557631876206928,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6478,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.39869783455438407,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.6588,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.4479245163416833,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.6612,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.46517723517096476,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6641,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.38834700022906427,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.6497,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.45946022382496476,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.6648,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.42073745894658116,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.643,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.466784594465401,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.6769,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.42517084430908664,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.6687,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.38895462758587773,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6537,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.48604488584972266,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.5972,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.46578198589028647,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.6648,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3968236236964227,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6607,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.4798199203451227,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.7428,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.4531935558414575,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.6634,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.34558900265033327,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.632,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.43787546322679616,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.656,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.43250325432433856,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.6779,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.38833312220396976,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6292,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.44540024272174433,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.608,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.3579828149017168,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.6255,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4053175333424152,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6258,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.35858098645794223,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.6704,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.3987451827915316,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.6937,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.42997637983274856,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6585,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.3932526654130728,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.6462,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.3870233656284184,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.6615,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.49712452919958033,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6623,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.35625906326919565,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.6282,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.42291877206495576,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.7098,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4253620364990812,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6386,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.3912739689870102,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.6086,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.41318139248268376,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.7141,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.43295533205841463,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7043,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 1.1398184199401138,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.7222,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.3862703219307575,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.6426,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4677947520049491,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6662,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.401846080147303,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.6521,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.37286357672527154,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.6062,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.40748923628071837,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6926,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.4479423524142015,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.6756,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.42234114843813786,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6476,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3896182410264214,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6772,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.4411435829201509,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.6234,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.37933568532674966,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.6204,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.46265736442828825,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6668,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.4393018454365821,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.6645,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.5177754712691343,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.6997,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.42607221478118706,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.666,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.39882355924879737,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.6213,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.38805641941283864,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.5997,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.38762433495801163,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6421,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.40016323813150256,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.6085,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.37616401930803023,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.6258,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.41028978668860133,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7122,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.4042626959520759,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.6362,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.45220879835180783,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.709,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.4293227756763883,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6058,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.39955554118477965,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.6197,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.42040530700673934,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.6397,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4265123061001254,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.649,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.431492915890074,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.6597,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.3893191806148624,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.6364,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4593682382286822,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6714,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.4513437746047561,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.679,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.37487776255065614,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.6301,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5125427502539084,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6741,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.4875542190779091,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.705,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.3913131109343328,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.647,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.39692001539739485,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6315,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.3661830086625643,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.6102,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.40088562778368225,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.6512,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3968740135921082,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6251,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.4561766053407961,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.6348,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.42854281632100244,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.5912,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.5283300083820703,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.731,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.3956666776345902,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.5672,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.3811259172079882,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.6196,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4769350025419239,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7024,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.41630517232447295,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.6686,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.5956279258644027,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.6151,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.40433857542379514,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6602,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.45198972435300755,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.6614,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.4242156941762448,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.7417,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.37644881883817727,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6413,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.3922208031071116,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.6515,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.48489005861581225,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.5778,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3956409256924293,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.643,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.427651978645937,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.6556,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.3547420794780449,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.6127,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.41186232406623996,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6259,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.4607083618840504,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.649,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.4424568065373181,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.7219,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.5217758891399114,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7046,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.36482890737078805,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.605,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.41144418918267145,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.6534,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.42698741671175594,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.704,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.3781262769116132,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.6249,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.3811219502776769,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.6517,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.3962703830271651,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.647,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.47512105748597355,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.6546,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.3864120022213843,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.6675,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.40235347840914154,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6411,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.5395692983578562,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.668,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.4633234751653702,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.6619,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.4252508666674081,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6064,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.40105004829371027,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.5626,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.42489712099458443,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.647,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.43580292129353576,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6945,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.39032402412353395,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.6747,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.4538435511477062,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.6278,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.40414914792484313,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6237,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.3715303103103586,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.6401,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.3616164453355704,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.6366,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.40727321244216635,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6349,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.3995228220202781,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.5915,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.3796897322527262,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.623,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4191499271698253,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7008,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.4151839265104562,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.6186,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.4765363310999923,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.6983,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.45251210327244407,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7252,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.46979099440068206,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.7144,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.400590774372865,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.6376,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3643801004323646,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6127,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.411676426115019,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.6431,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.46124395883928804,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.7374,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.435690599029188,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6413,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.4811840917063531,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.7198,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.3792243676588303,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.6197,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.4062410023533493,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6669,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.5052405679987543,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.773,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.414816182297928,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.6194,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.46485351218905546,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.5999,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.40217650610978745,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.6472,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.41126670572783974,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.6621,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.42123602460746146,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.676,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.4253579515095016,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.6946,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.3879289872092901,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.651,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.47074809141769725,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6865,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.4525618442462892,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.6473,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.4211196142878507,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.673,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3932298817755076,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6081,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.4484407693371872,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.6591,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.4157957671793489,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.7103,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.40025302364842813,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7238,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.3735438460830809,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.6057,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.4258025624491941,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.6287,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.41435165265860574,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7342,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.43510629641125437,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.6102,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.4155326388136457,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.6477,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3920465694406834,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6337,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.3280776536946831,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.5867,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.4020150710561621,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.6606,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4321202747337267,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.653,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.4145771553289532,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.6449,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.4000997935495122,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.6388,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.37776776801238715,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.5843,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.38639775734571263,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.6342,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.38802333600420025,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.5957,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.4380942056655734,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6544,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.4043243800838281,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.6314,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.37685966819642175,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.5692,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.5074379766120739,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6473,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.4747154935129227,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.6918,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.3809069679768159,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6786,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.3936443972311563,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6358,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.4460387695423164,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.6629,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.47158508880496003,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.6718,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.45039878447397613,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.594,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.3760296162927113,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.6529,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.3887184512734403,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.6598,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4221929479111637,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6554,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.6988186268204654,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.6674,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.36250401220062856,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.5585,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.36621982144568793,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6274,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.38601818339233507,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.5878,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.4316403933335295,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.6424,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3907795425417193,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6275,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.45314061352335727,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.6697,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.4256923628283321,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.668,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.5072675399710331,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.7495,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.3819982521836467,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.6652,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.4377503488023667,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.6411,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.36867785125647967,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.5995,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.40642499892036954,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.6682,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.385843008891069,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.6242,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4325274716602865,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6582,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.46100598318068164,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.6108,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.42482416416988833,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.6546,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.39647749049588554,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6523,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.441865136236515,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.6661,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.4417314914160828,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.6063,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3908224130249587,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6564,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.41361323719504467,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.5932,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.45220246474049364,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.6369,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.37290705932486357,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.5997,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.3952853062774028,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.6158,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.411211223469002,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.6074,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.41492456552185275,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6444,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.3567417740050895,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.62,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.41309505589507733,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6294,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.4773359945231895,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6836,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.42567710901459754,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.7079,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.40052062915954073,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.6326,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.43124179647071165,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.5604,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.42862447927787145,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.6885,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.45500083220282006,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.6823,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.37262762163934443,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.63,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.39930643408061656,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.6416,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.4450527614039218,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.6659,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3509670630771238,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.5801,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.39275889558823696,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.6156,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.4365065813793774,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.6213,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3129303170662575,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.5871,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.39102267898629967,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.5783,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.39074303616585854,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.6334,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.47323989776245445,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6175,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.46029568340569943,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.6693,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.4273119427517804,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.632,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.36461203285948796,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.5611,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.4301826055749961,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.6514,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.37385563161907515,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.6295,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.40609506955915836,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6105,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.5230191893009891,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.6659,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.3839015364109024,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.6077,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.40991190312692427,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6648,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.4183908345604154,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.591,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.3839402985392285,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.6354,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.39316191117423616,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.5511,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.3757351480584375,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.6634,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.38586563028159926,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.6076,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.40401411814743754,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6151,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.4848653050261294,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.6956,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.5332637343551315,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.6617,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3740767346843177,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6384,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.3720633228138245,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.609,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.47431548855743244,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.5865,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.4379463383797498,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6096,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.39255985136755134,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6216,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.39246122531624794,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.6354,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.41380264443785086,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6603,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.3363711579115939,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.6206,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.410785747452271,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.6419,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.38915997907587807,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6565,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.468336153616573,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.6887,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.38816798537475805,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.669,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4407475349265452,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6904,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.6455077887809194,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.6925,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.4047821122693816,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.6348,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3904817170860773,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.5996,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.4256817343847479,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.6196,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.3564141647024927,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.5845,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.43895240241864214,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6411,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.41344241738837445,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.6282,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.47764251794889384,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.6491,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.40405844877676395,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6292,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.40747542751942456,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.5761,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.4172261378329325,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.5867,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3880975350407753,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6298,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.454827666158906,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.6357,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.4135524865170144,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.6125,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.3627291761852448,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.5833,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.41676502372058544,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6458,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.3658004704309575,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.5856,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4634414266057261,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6891,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.56074419797481,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.6194,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.4319324344035407,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.6244,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.38178275769314285,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6477,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.3917852782025771,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.604,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.43663593848716403,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.6185,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4232853699490676,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6224,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.3658584722646184,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6368,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.44565137919498404,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.6221,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.44278550694929475,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6462,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.4123636761689713,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.7347,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.4079679364330374,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.6451,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.5168076566291907,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6402,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.38173849341906546,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.6282,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.4524165538190746,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.6428,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.4143510686204648,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6359,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.4433278715654798,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.6412,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.44529038500856755,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.573,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3726690994770168,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6233,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.36030377090053944,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.6211,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.40693922341498406,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.5771,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.401872353212445,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6278,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.4802253049924863,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.7407,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.3927311504866121,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.6169,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3980797671464647,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6038,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.396463500581168,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.6191,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.4246288215052101,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.6639,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.36942287784701217,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.5835,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.39238846065412564,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.6085,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.39605568898365406,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.5944,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.47417463303431173,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6406,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.41033444761054455,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.5882,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.3506013690792776,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.6245,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.3842673745708574,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6776,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.37624110756408163,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.5891,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.44301843398022756,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.6831,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.43605392399193227,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6384,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.4598273335227648,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.5941,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.4141225681986433,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.6381,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.42394156093608293,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6768,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.3548570188787065,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.5555,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.39023735618158406,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.6145,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3359493120677617,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.5751,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.4226374951462766,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.6533,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.47239366046471304,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.7022,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.37894693036953175,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.601,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.4026214658454624,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.6149,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.3897012031544754,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.601,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4867669436897636,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6589,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.40006855874177877,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.6324,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.36040475737051053,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.5511,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.39138607786669155,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.582,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.38396535899467,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.5841,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.3776215499822429,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.6831,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.43813368686973886,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6174,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.3841077501914417,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.6296,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.3892898735979422,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.6327,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.40353192605898125,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7106,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.39140572566359955,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6234,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.4384632881438894,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.6934,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.5225492263587118,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6741,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.5722418748382089,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.6537,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.40576271575043527,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.6053,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.419476986095844,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.5853,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.36410852383771564,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.5529,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.4096711146858712,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.608,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4069274461277458,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6141,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.36476842192995557,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.5893,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.42563347102734733,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.6448,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.4692462806578368,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.592,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.43930859709973186,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.6242,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.45687506190316407,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.6265,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.399945416164093,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6369,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.3837425889924183,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.649,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.429907781056903,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.6715,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.39455565201868215,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6036,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.4080082886430872,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.6283,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.3630930563014412,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.5932,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.37576258976580323,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.5762,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.35183356342237204,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.6029,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.4491934616307518,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.6398,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3726502213271665,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.5828,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.40384795307175253,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.5927,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.4366373927791251,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.6991,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.39394958900157206,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6119,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.38678026946844857,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.6334,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.39462507397666935,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.5922,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3859662830695169,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.5692,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.38515250216530106,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.668,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.456437290699982,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.6638,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4168506986866765,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.5815,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.39807632655886105,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.6329,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.38254096155961537,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.661,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4458411190257632,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6396,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.4154278363491857,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.6967,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.40290083595644605,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.651,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.4108782186035999,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6025,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.367731559943917,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.6344,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.40629503604824857,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.6146,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3645058833020978,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6137,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.4015338086627751,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.6482,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.4148085919365177,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.6587,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.40470168724297817,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6203,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.4109070896313481,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.6397,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.45667782401349,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.6691,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.41273803679512366,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6089,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.39738829331285314,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.5557,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.4035938123952541,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.6179,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3592535699415328,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.59,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.34395768642729996,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.624,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.37888093887754987,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.6166,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.42673984972854934,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.5848,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.4348623379382092,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.5819,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.4217765180561866,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.6488,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3926592127503482,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6102,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.444535421100806,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.5667,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.37800782166550384,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.6631,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.484669796405072,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6771,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.34471682796148445,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.6357,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.40101464844953705,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.6635,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4584298369094714,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6146,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.4030602211795025,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.5997,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.4529669138975379,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.6184,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.39090019517918023,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6119,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.416318764923169,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.6376,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.5807650457041063,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.7556,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.47773024387151364,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6892,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.4268904491824666,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.6724,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.4339520738304605,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.6838,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.42365839653832277,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.645,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.45171866803310495,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.6252,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.41195121058189305,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.5671,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.40865072858341894,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.5746,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.36578120703551986,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.5613,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.36830492227380884,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.6237,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.4927935605607071,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6696,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.42478793778815627,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.6476,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.39578411104719047,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.6248,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4106414831054956,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6353,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.3985521483275362,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.6148,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.444840586650571,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.6618,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.575592785908353,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.7099,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.3683201876374113,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.603,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.4579511056092031,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.628,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.39842390606428885,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6429,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.3891734424738252,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.6085,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.39349054248607734,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.6449,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.42839801800170924,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6763,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.39235277618039877,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.639,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.3993459662079003,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.6433,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.41856979205822786,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6068,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.4034791766233207,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6116,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.40221452484703857,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.5847,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3967911364122253,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.5719,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.47801316808208466,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.6542,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.3986920049012869,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.6256,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.38939715834170985,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.5897,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.41777207997562515,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.6776,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.37259748507667356,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.6007,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4736891450729395,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.647,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.43992339582496437,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.5661,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.44125070212055256,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.6368,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3545513928119534,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.5164,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.5070293760527375,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.6184,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.4904896591151178,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.6146,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.3967245313182166,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6031,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.4736125983986299,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.7248,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.35345499941635394,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.5785,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.44186475929953245,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.5418,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.3808805874759711,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.6139,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.38242530597868213,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.5911,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.5134288879788514,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6435,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.6337127497375044,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.6515,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.42488961332511715,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.5878,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5240689106181592,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.7358,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.4718658727732926,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.5768,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.4401430668645281,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.6079,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.47047426174415424,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6162,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.4339177242243759,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.6586,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.3599831842810963,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.5841,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4099964056909604,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.5877,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.36751292737528646,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.6466,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.4165507690117416,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.6184,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.39162379094696625,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6243,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.4116873277237225,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.623,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.4210521732498824,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.6184,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4491235615447384,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6641,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.5129827043466911,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.7014,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.41375745180416124,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.5795,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.3943462641779985,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6885,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.41858767027493393,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.6441,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.42409503107067126,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.6457,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4231058612450969,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6785,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.3960780998009343,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.6123,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.3742043328992552,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.6058,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.4736101999046393,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6489,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.46955758979360435,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.6478,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.7634695841735873,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.618,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4345938737690243,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.7012,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.40090791844884754,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.5792,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.4950215877037177,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.6744,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.3709300557238186,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6294,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.40671767753834,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.6353,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.4308083112903099,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.6227,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4291744547709792,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.5853,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.4476219517321725,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.6051,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.45810225605925653,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.6894,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.36983347197203204,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6412,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.4293497945626058,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.6402,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.5074833976497126,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.6519,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.38837993512590113,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.5602,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.4493312016772392,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.6596,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.3745228625348613,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.6226,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.3926946003367579,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6605,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.4031266176315358,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.6182,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.46302782865461184,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.6093,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4023029515757742,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.5404,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.4345522125756673,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.5981,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.4018408074306652,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.6337,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.43128450719572176,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6143,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.4276601320324969,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.6236,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.4078760862656175,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.6019,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4146378365591098,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6047,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.38866854543260576,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.6236,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.41247922047810026,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.6462,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.32855235101787567,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.5745,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.3532860254011047,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.5772,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.3960725048956789,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.5992,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3577972714817226,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6381,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.4057899771618968,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.631,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.4259913592867645,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.623,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.46727949277813496,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.711,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.43382358933471343,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.6746,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.4050494645563292,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.5776,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.39697118504461587,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6343,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.41634897817309924,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6555,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.3592250048605172,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.5408,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.3943800527241845,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.5769,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.3899844201519674,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.574,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.42674209778020844,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.5894,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.44357777850984076,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6398,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.4264535946350911,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.6269,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.4433193047328518,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.6676,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4659300803923899,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6006,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.4280070596656568,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.6374,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.43230543719081344,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.6566,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4659420349151,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6639,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.4456625626122343,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6279,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.3314495580489305,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.5381,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.41115079767153645,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6671,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.37481290351521623,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.6031,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.34037181057619237,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.5686,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.46170967564430004,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.669,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.3790339373028516,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.6123,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.3954633192139074,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.6229,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.433350783415259,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6782,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.4321933831959839,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.7233,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.4337168503950566,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.6596,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.4626165639483129,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.628,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.39743699028389945,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.6195,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.3519174310870393,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.5934,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.40373434760817883,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6688,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.4004436444904916,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.6117,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.45114807122599826,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.6581,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.42634126146566326,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6241,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.4408191682479045,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.7201,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.37813244503353843,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.5779,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.43341126939368974,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6321,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.4182754317940364,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.5721,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.38225025649677735,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.5936,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.40240998851253346,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.5944,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.499859620034534,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.6874,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.3739439660247912,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.5971,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.37083645714936647,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6019,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.457070008074725,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.6075,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.3940632112896435,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.608,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.45402641693876467,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7006,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.4380311216268415,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.6746,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.4253395986872956,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.5907,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.534865339392156,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6861,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.3965521221806843,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.5446,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.3638765529336139,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.6358,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.41302575114306894,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6443,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.3856587072635981,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.6731,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.4184150231637184,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.5624,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4129135611595348,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.652,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.41559419100228573,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.6524,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.43875770711272727,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.6379,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4235053002596906,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6376,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.43980229694413764,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.6442,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.44154118079189997,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.5812,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.4212359075612435,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.5912,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.3833256754274344,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.6238,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.3868226520224876,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.6138,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4562045231270473,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.5786,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.4313624999644069,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.6279,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.3779659342751667,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.6687,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.43331408451293585,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6461,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.4561877150242593,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.6213,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.41568795408595843,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.5681,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.41666064890528937,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.608,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.35051443835931156,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.5398,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.43540457235463903,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.7033,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.4239455387270707,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.647,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.42480969329749607,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.619,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.4548466775430133,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.6261,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.4474854245936277,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6213,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.4400101674067497,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.6145,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.3875863845126434,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.5775,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4777366702556033,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6843,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.4866754091612041,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.6528,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.613569179778805,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.5573,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.40498745819952764,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6056,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.413474162763506,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.6025,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.44597027165955117,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.6104,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4064801311758751,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6437,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.41690486845907654,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6525,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.4516547281715239,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.6222,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.43088246059304064,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6326,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.3937326241318572,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.6102,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.4179881256944634,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.6256,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.3752653805864915,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.5775,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.3374245664346441,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.6107,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.39522870952333566,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.5733,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4943357520654654,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6241,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.4213764358629093,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.5992,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.5586452322644974,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.5766,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3824768095593952,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6512,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.43902398729396896,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.6067,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.39133704391079804,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.6296,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.39219399405215594,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6471,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.3877358568441335,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.6099,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.39164164231563814,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.5876,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.403644328513689,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6149,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.5572673393766915,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.62,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.48844084356238204,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.6791,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4106253067003294,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.5834,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.5232318447915651,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.6561,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.4081165885957195,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.604,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4384446679985943,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6374,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.3793257951017932,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.6427,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.37436951514651023,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.5791,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.6507421966514167,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6979,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.414392189643734,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.6493,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.3898163698700003,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.6263,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.43892173194381723,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6251,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.4061868178940558,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.5776,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.561940286841165,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.5919,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.4444289938582935,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.5448,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.43147072910826345,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.6355,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.37023411773954457,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.5794,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.4362952651794109,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.669,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.3515469215393279,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.5957,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.4974092478566155,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.5985,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3808216459432855,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6183,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.38782983278814614,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.645,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.3849222818790878,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.5895,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.43622022139414524,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6589,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.38740406391785887,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.548,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.3775418121317073,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.5708,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.37899514424739766,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6044,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.3952366771236291,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.6132,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.45795803936713125,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.5841,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.39120816038265205,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.5932,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.45997035320821184,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.6514,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.3990009793873229,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.6181,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4078619114901513,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6127,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.3757411604027671,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.6095,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.38505502173080025,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.6205,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.46882005313831376,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6354,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.4170014431891796,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.6405,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.3966189547801234,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.6008,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3802848692980568,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5816,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.4275208205666537,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.6343,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.41248547085907605,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.5959,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.4283474840922225,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6231,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.4245547427079539,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.6201,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.3908314103669169,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.5737,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.34079404885288034,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.5836,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.40286549864423876,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.5945,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.46596364619993086,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.6385,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.3940174311468667,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6279,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.4203965134296234,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.6402,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.3774077358110915,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.6194,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4008472539371659,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.566,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.4080999169238299,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.6448,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.4077745692946297,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.6125,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.38226107295436784,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6245,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.41375038124086927,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.5857,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.3966773367374224,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.6348,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.40442872978303074,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6214,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.38841286408602566,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.5974,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.46383341076347295,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.578,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.49119919317599087,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6599,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.41483896589092045,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.6455,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.3285919278758509,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.5492,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.47265575503752827,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6146,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.4001102881783127,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.626,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.3969808974787441,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.5833,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4771903062703072,
+      "learning_rate": 0.0,
+      "loss": 0.6608,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1649201477156864.0,
+      "train_loss": 0.7004796615282695,
+      "train_runtime": 29310.7261,
+      "train_samples_per_second": 1.024,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1649201477156864.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0364082e66eba99ad87431531c6484171aba51ef
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "down_proj",
+    "k_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a015251aba2f3baf7f2ac1be2e37668039831dad
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad3b0810ecbe492df7ac9ea0a150f743913dc2e286ae5b069645bfac33125397
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..312bca50e02855ac65838f848b783490e0b1af5c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:77ab988b2b75ddea21d2207ee190795b609eaebf993e7b5701d6fd564233b7b2
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..601aea95bb2089e62c16f16716bd520ab5e4b306
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.0020062019097136,
+      "learning_rate": 2e-05,
+      "loss": 1.4094,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.1672913328138943,
+      "learning_rate": 4e-05,
+      "loss": 1.5713,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8842418536610446,
+      "learning_rate": 6e-05,
+      "loss": 1.3336,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7848675740570894,
+      "learning_rate": 8e-05,
+      "loss": 1.1949,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8436775517980575,
+      "learning_rate": 0.0001,
+      "loss": 1.1434,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.808923767814001,
+      "learning_rate": 0.00012,
+      "loss": 1.043,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.9265473165199244,
+      "learning_rate": 0.00014,
+      "loss": 1.0547,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.8782500227023723,
+      "learning_rate": 0.00016,
+      "loss": 1.0547,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.7012466031858726,
+      "learning_rate": 0.00018,
+      "loss": 0.9094,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5475305846513644,
+      "learning_rate": 0.0002,
+      "loss": 0.9137,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5616025381116986,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.8534,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.587551632299327,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9994,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5028924581189982,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.8336,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5466220925904238,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9528,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5712862342882589,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9179,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4988168205894673,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.8687,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5180030366003013,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.9255,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.616942952759206,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9314,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5584932054266343,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.9221,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.48240484153272345,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8666,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.5009492975166543,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.9445,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4991179929795562,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.925,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.44584094654822,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8652,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.46242576323411977,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9334,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.45158839685216784,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8848,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.45338770941838574,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.7919,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4692829974767608,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8308,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.49506573014670635,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.872,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.46172406685438194,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8506,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4520624065427708,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8671,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5370489718418412,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.9335,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4747014965518446,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8181,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5330780221340524,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8758,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4461456900225593,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.853,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.42907229569763317,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.8721,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.44077146618160284,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8609,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.41030197878298086,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8098,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.49557317411685103,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8831,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.44605271336641833,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.8377,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4826928471153355,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8205,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.44838532908845163,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.7902,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4734508589182639,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8583,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.44919162382201094,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.815,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4519943868115642,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8703,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4229071333914242,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.7932,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4738970902391505,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8451,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.41887929710668087,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.7893,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4799740282974879,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8122,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.49665391720151236,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8565,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.42640442472050694,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8022,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5923745737555824,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8227,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5169313193580595,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8382,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.411955879890394,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8153,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4653962529308693,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8063,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.46401764632163195,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.8618,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5098831978733065,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8702,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4550613856493534,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.7666,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4334229661431585,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8294,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5014227971036841,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7827,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.44130053629511323,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8197,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4293140157957783,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.7983,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4594223463930405,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8582,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4141932528344203,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.842,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.39826030960177344,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7777,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4629047201613823,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8507,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5035872872686349,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8771,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4890199906535899,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8973,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.45734084694048316,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7985,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4622158365476711,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8644,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4043135056245759,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7464,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4043710195936969,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.7411,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.48797963423635654,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.9087,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4514538357948917,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8023,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.42213380789547084,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7816,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4661438499135115,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.808,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5232351498258501,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.9008,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3975491846434713,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7479,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.44571590233114655,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.8146,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4178992442982215,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.8273,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4530115017535405,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.8645,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4886539415071477,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.788,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4056803818730166,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8116,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.39073317913364825,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.7638,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4688396587663738,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7963,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5008490816743745,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8696,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.41439579567404433,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7953,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5384060646901033,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.85,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.43573649346247634,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7754,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5280762687540471,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.7931,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.45320274465650234,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.7874,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.3980614452364451,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7328,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.5159807437944037,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8251,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4798140262593944,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7757,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4452205576856887,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7538,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3745168487175859,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7036,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.5074584143384333,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.804,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.43561945612065134,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7436,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4848632241795599,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8231,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5139486642178871,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7744,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5522899077467178,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.9019,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.47312384533831275,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.8012,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.46402196649732563,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8558,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.41689457770001886,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7365,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.4012615086815032,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7223,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.38947271506208403,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7767,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.43740220066952523,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7461,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.43193908663600816,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7846,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4301344958314146,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7225,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.43533146434585573,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8807,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4237347133491233,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.7783,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.40306999298900875,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7687,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3986230491682654,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7666,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4994081964028108,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.8079,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4344344140485525,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8157,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.5043695566357764,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8553,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.45373707073050146,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7418,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5003550553196944,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.8143,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.43104315436973506,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7843,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4360808885133991,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.801,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4413032958474301,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8097,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.37330247661022875,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.6842,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.44689400956413305,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.8236,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4503572306579569,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7274,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3883114965737896,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7537,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5153659790762082,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.8219,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.40559233592673544,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7438,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.38508116038201756,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.7713,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.44845430495824773,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7845,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.5045862534919716,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.779,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.45811118619528685,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7595,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.554927213995397,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7426,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.45414281431875614,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7745,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4180034067605168,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7423,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.430146230620113,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7587,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.39963289577534084,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7324,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4398823647567502,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7728,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4141436944414363,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7746,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.44479075402997625,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7397,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4285177243435202,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.8109,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4411044029778361,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8116,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4137959768828525,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7417,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.439364759321868,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7878,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4489862198564562,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7126,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.48578216440407823,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7635,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.45022516264972906,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7899,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.40872955925173193,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7341,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.42586202092150144,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7314,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4171292822856141,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7595,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.480380131441867,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.812,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.40526609693194515,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.769,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3840310308424327,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.719,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3752893256883143,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8175,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.42208306169316506,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7695,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4708087065120164,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7581,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.43669467431527925,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.8088,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3694687014511646,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7467,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.40360189905265087,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7809,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.5168717922014412,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7352,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3671113041044409,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.6766,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.45002238461802757,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.8266,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4309770941080891,
+      "learning_rate": 0.0001,
+      "loss": 0.7716,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.38193922359229104,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.6886,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.47255435453422723,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.8111,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.41050966292043334,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7198,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.40167619297449497,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7607,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.43591555439181373,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7679,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.39375915423090696,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7431,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3911415923341417,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.748,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.37806000003935913,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.718,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.47918580480180134,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.769,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.456254451883631,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7987,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.45635188698492374,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7745,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4062422613227234,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7558,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.44786737175933516,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7439,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4420163721822703,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7468,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4366403785449747,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.8282,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.44160567802460143,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7693,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3950052585958004,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.754,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4864733609910337,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7768,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4203970972293743,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.802,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.36364201707045174,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.6714,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3773423440015053,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7531,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4122032139585757,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7392,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4060138041258608,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7826,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4500005144010625,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.6901,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.37550186795184387,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.6867,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.451039864858227,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7846,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.38497885909127655,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7088,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.5154266123974721,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.8177,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3703640026584789,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.6701,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4386400051645292,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.7671,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.402822326738162,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7669,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.43335857320343696,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7568,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.44062106618622304,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7122,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.46213988006162576,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.7027,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.4508439790652391,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7829,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4830750076650203,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.8339,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4822485380332219,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.704,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3450710872496977,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7296,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.39481863420916286,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.6931,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.49669230264861525,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7934,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4154651254147214,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7512,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4602137806235495,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.713,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.39711180804932905,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7526,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.38725643050847247,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.7804,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.38570099899258914,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.763,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.41759594142356127,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.6926,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4784349568071053,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7353,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3642809349585427,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7078,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.42279399570363735,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7707,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4232770458907553,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7892,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.44417509587658666,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.8029,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.4112498099064009,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7499,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.38248075464712855,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.694,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.388512224349573,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7339,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4493476842881038,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7018,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.4022224547119,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7408,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4416873626755746,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7273,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.33804220900980275,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7121,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3567832952479447,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.6917,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.4449544370633313,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.8005,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.447920701823006,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7294,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3731730633660234,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.6956,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.42750086388493475,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7655,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3920466926065124,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.6975,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3611682156930113,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7159,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.41926361819237346,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7275,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.41971660784876186,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7545,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.7413673275066183,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.673,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3806863118171017,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.6866,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.41477013408202706,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7455,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3876082407382047,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7878,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.41300439564787,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.6675,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.36066932638629273,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7172,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.37561012170468494,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7123,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4408534490279242,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.779,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4389182520268332,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7727,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.44084624793958344,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7353,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.45965825857016396,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.8268,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.40486898098610974,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7036,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3831590332909565,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.6851,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.6399623309462235,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.8118,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3955238681930386,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.6972,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4022287665333359,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.733,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4298350782931847,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7647,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.49694477718490887,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7549,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.42114719991142247,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7375,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3809928321742791,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7021,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3798248954210015,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.7627,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4004290153653874,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.731,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.40098733120968666,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7532,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.39095986270735283,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.687,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4113257164040196,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7505,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.394684889161784,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.7308,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.41218562414416526,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.682,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.5006514104155203,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7833,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.40635351115200874,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6955,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3816074195458233,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7069,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4225561882163358,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7832,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.41080313956124365,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.714,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.41574441763530995,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7687,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.458118134115343,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.718,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.4244817211567879,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7004,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.409980416870785,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.6762,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4303430200185168,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6777,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.5187693825788201,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7903,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.43812839922116,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.731,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.4613001122623699,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.6949,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.42749542124196754,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.748,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.40464163859777,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7253,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.39804294526476447,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7374,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4266101798907561,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7215,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3885077025405382,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7357,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4846915505912817,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.8117,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.452520369015205,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7179,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4184692167854737,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.6905,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.420499872484043,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.6931,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4251946133856474,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.6707,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4272730684539094,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7537,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.6309998230648284,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7164,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3862301256612526,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7741,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3931981740744817,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.6871,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.37541414814492263,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.6975,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4028054962453646,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7087,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.37853946883499306,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.73,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.5788423755412683,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7262,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.34013834265448084,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.667,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.43818614152472873,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7841,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.35493850044501696,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.6632,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4196389310359502,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7426,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.41823350521904995,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7649,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.7150743902715252,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7215,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.45743484487287117,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.804,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4112747257690636,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.69,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4095135251036486,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7374,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.5067443148357712,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7021,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3598532464586451,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.7058,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4433523684821114,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.6705,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4160728625035564,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7909,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.41033752228612996,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.6993,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4877390319538481,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.631,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.44045946935661956,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7106,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.45442637015135684,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7077,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.4241902847444021,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6472,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3707542425270281,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7192,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3856152515473988,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.6616,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3917590365344406,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7024,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.4543793555941507,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7265,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3625434659433941,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.6541,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4035496798827605,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7096,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3782322554766045,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.6533,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.4651090593785289,
+      "learning_rate": 0.0,
+      "loss": 0.7347,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 271218014191616.0,
+      "train_loss": 0.783779461414386,
+      "train_runtime": 4880.0845,
+      "train_samples_per_second": 1.025,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 271218014191616.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f7cf87d675e08341bc5385e4fd15960420adf621
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..618c34f4b39b838fa33e06044a8d7fd38bcb5115
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a98696296054cee5685487066ca076077023d0606ae9c626dfb0e2c99c2ef33
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..268844bfc521d01270d9785d933ed2a8e7dfdc4e
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a69debb061221fb65c4c29b26cee8d8c34f90d50e87801d857baad12862433f
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..02792653adc727c1ee1a33fef301f301358cd438
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,1134 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.002639836398211,
+      "learning_rate": 4e-05,
+      "loss": 1.4904,
+      "step": 1
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9148778845307886,
+      "learning_rate": 8e-05,
+      "loss": 1.3615,
+      "step": 2
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6218126570709165,
+      "learning_rate": 0.00012,
+      "loss": 1.2636,
+      "step": 3
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.1733819649415596,
+      "learning_rate": 0.00016,
+      "loss": 1.2422,
+      "step": 4
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.8961458294612699,
+      "learning_rate": 0.0002,
+      "loss": 1.0297,
+      "step": 5
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.7104478974141764,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 1.0069,
+      "step": 6
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4453839306748887,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9213,
+      "step": 7
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4364578353790173,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9275,
+      "step": 8
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.46386602430694124,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9556,
+      "step": 9
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4390379657400523,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.925,
+      "step": 10
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4562302929225185,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9569,
+      "step": 11
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.39341154607993345,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9187,
+      "step": 12
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.38162793658742256,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8549,
+      "step": 13
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4132867175537064,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8685,
+      "step": 14
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.40801041892631884,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.878,
+      "step": 15
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3705814731127435,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8901,
+      "step": 16
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.36330646232631925,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8798,
+      "step": 17
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.33080409264465793,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8781,
+      "step": 18
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.32818135077853167,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8538,
+      "step": 19
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3362079631555025,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8387,
+      "step": 20
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.33995378997861403,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8369,
+      "step": 21
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.3917103689027093,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8547,
+      "step": 22
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3348872695789784,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8289,
+      "step": 23
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.35776545677090854,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.807,
+      "step": 24
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3334221740476462,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8377,
+      "step": 25
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4039360148046414,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8377,
+      "step": 26
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3214933683269574,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8202,
+      "step": 27
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3609707150993955,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8722,
+      "step": 28
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.31292033520753265,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7987,
+      "step": 29
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.32661815417153717,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7988,
+      "step": 30
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.32009062981637354,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8287,
+      "step": 31
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.29528010997310794,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8102,
+      "step": 32
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3446237393250682,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8627,
+      "step": 33
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.34109389969001735,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8496,
+      "step": 34
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3162295461366009,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7985,
+      "step": 35
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.35365519160873493,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8226,
+      "step": 36
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.33091699840279165,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7905,
+      "step": 37
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3611309922925739,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8525,
+      "step": 38
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3130692668968462,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7838,
+      "step": 39
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3078365535973117,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.8411,
+      "step": 40
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.33350693683260196,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8001,
+      "step": 41
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.31163537934096214,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7737,
+      "step": 42
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3408303214049586,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8311,
+      "step": 43
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.319324969812128,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8081,
+      "step": 44
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3296644378182685,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.788,
+      "step": 45
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3381874837565884,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7745,
+      "step": 46
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3439330013144162,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7663,
+      "step": 47
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3395715333488213,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7516,
+      "step": 48
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3475199659571279,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7809,
+      "step": 49
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3689529758365805,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8353,
+      "step": 50
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.32978099329172,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8185,
+      "step": 51
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.31695533987483404,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7293,
+      "step": 52
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3002706071495285,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7607,
+      "step": 53
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.31072433005235056,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7501,
+      "step": 54
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3188597094696028,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8266,
+      "step": 55
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3011529492928554,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7653,
+      "step": 56
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.33407894117308795,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8106,
+      "step": 57
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3459865222149195,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7883,
+      "step": 58
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.30487947849131897,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7963,
+      "step": 59
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.31576280319394795,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8017,
+      "step": 60
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.2983397534698475,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7508,
+      "step": 61
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3020225486456565,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7391,
+      "step": 62
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.2991633849034537,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7753,
+      "step": 63
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.31060377997959776,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7797,
+      "step": 64
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3634309228876997,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7659,
+      "step": 65
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3603104135897214,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7476,
+      "step": 66
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.31528714964980065,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7504,
+      "step": 67
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.29930350113657345,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.752,
+      "step": 68
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.289895410892452,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7543,
+      "step": 69
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3179723569904245,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8121,
+      "step": 70
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3022004251250754,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7608,
+      "step": 71
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3175969070633314,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7423,
+      "step": 72
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3051849195123186,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7585,
+      "step": 73
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.31862225113793446,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7477,
+      "step": 74
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3262675703408152,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7909,
+      "step": 75
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.28143259427212386,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7668,
+      "step": 76
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3273677507239454,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7539,
+      "step": 77
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.29009024969694625,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7765,
+      "step": 78
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3317167312872358,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7575,
+      "step": 79
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.30380303061938135,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7462,
+      "step": 80
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2917701370546009,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7377,
+      "step": 81
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3395467838004645,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.773,
+      "step": 82
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.29853428274769916,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7595,
+      "step": 83
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.2945108725430035,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7495,
+      "step": 84
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.32639884964875193,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7449,
+      "step": 85
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.33706708071043895,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.786,
+      "step": 86
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3479230154717327,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7484,
+      "step": 87
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3101170670613661,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7877,
+      "step": 88
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.29316190913483436,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7623,
+      "step": 89
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.33768620036386193,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7881,
+      "step": 90
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.2905393513774914,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7145,
+      "step": 91
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3043993112462114,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7615,
+      "step": 92
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.2834334048638075,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.6896,
+      "step": 93
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.30493595189203726,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7528,
+      "step": 94
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3092316178546374,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7499,
+      "step": 95
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.30722136255670696,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7695,
+      "step": 96
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3083405078773032,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7384,
+      "step": 97
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3159443481603721,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7514,
+      "step": 98
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3450221012225604,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7751,
+      "step": 99
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.2733089025692975,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7175,
+      "step": 100
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.33594301474514837,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7783,
+      "step": 101
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3180809001952563,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7382,
+      "step": 102
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.28901339983137636,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7788,
+      "step": 103
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.31126724419239143,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7217,
+      "step": 104
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3009274137359368,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7471,
+      "step": 105
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.31539695546289276,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7997,
+      "step": 106
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.286607187152073,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.729,
+      "step": 107
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3018808665145025,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.724,
+      "step": 108
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3127535971736329,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7455,
+      "step": 109
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.26012111785159153,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.707,
+      "step": 110
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3051050713891758,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7723,
+      "step": 111
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3118804782861242,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.737,
+      "step": 112
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.2737164240705061,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7149,
+      "step": 113
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3134425223786983,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7469,
+      "step": 114
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.2818906727208132,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.6867,
+      "step": 115
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.29056613377962315,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7748,
+      "step": 116
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.2972352441395284,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7007,
+      "step": 117
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.30985546794945074,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7526,
+      "step": 118
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.34058599695364866,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7624,
+      "step": 119
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.30900205953471865,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7718,
+      "step": 120
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.38846582883643044,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.761,
+      "step": 121
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.2900413609414705,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.7262,
+      "step": 122
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.34617299231482673,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7707,
+      "step": 123
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.31898208311746445,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7352,
+      "step": 124
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.27769781138534744,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7545,
+      "step": 125
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.28363617441671607,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.728,
+      "step": 126
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.302373451841636,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.7503,
+      "step": 127
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3271045216797624,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7423,
+      "step": 128
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.2919143721730832,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7086,
+      "step": 129
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2983360606395803,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.756,
+      "step": 130
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.2982372679255095,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7521,
+      "step": 131
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.29307631961647174,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7018,
+      "step": 132
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.40391346833649705,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.74,
+      "step": 133
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.32945897327262835,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7246,
+      "step": 134
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.2949428002691319,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7463,
+      "step": 135
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.30101875454180504,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7394,
+      "step": 136
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.32082387098249604,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7855,
+      "step": 137
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3086004908294671,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7121,
+      "step": 138
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.30293332836986925,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.6969,
+      "step": 139
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.31335455383145533,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7462,
+      "step": 140
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.29406534419663677,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7404,
+      "step": 141
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.28288570790135903,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7154,
+      "step": 142
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.34788707135985963,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7389,
+      "step": 143
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.290633490117723,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.732,
+      "step": 144
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.27377340408363693,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7141,
+      "step": 145
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.32626292933455625,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7551,
+      "step": 146
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.32123614225602803,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7626,
+      "step": 147
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.35821692271121813,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7313,
+      "step": 148
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3958937218979211,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7032,
+      "step": 149
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2967738869621842,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7575,
+      "step": 150
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4158348977915751,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.6813,
+      "step": 151
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.31372988976948457,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6922,
+      "step": 152
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.2804679598419863,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7026,
+      "step": 153
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.30239885095076025,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7268,
+      "step": 154
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.2686050777026808,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.6936,
+      "step": 155
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.308271729218658,
+      "learning_rate": 0.0,
+      "loss": 0.7043,
+      "step": 156
+    },
+    {
+      "epoch": 0.9984,
+      "step": 156,
+      "total_flos": 399576021729280.0,
+      "train_loss": 0.7931731194257736,
+      "train_runtime": 4808.9782,
+      "train_samples_per_second": 1.04,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 399576021729280.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5720738f4c1dd88ecb68abd6bf7f14dc39f5a1c3
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7ff46212d77e39e1ba1c94a12a7085c628ae8ae7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e44ee1bdeda95c7635baa01aa6d1e96db390fb5a146b26a960dad923d98a0c95
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c05c6912e7deaa9e4327a23815a617bbf2e863f5
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6368aa85e21905e872b8c52af77664941a9fa812c94755bee30265b2664e7155
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e47681ad3379893b82bbcaf7e204bbe3c93ad6e0
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9396278575656405,
+      "learning_rate": 2e-05,
+      "loss": 1.3595,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.145754195917076,
+      "learning_rate": 4e-05,
+      "loss": 1.5312,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.9101096939447241,
+      "learning_rate": 6e-05,
+      "loss": 1.3688,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9044070523413997,
+      "learning_rate": 8e-05,
+      "loss": 1.4315,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9366614139356674,
+      "learning_rate": 0.0001,
+      "loss": 1.2066,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.697160557893367,
+      "learning_rate": 0.00012,
+      "loss": 1.1375,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8327698812333196,
+      "learning_rate": 0.00014,
+      "loss": 0.9662,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.8262069347243871,
+      "learning_rate": 0.00016,
+      "loss": 1.0,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6074353633567875,
+      "learning_rate": 0.00018,
+      "loss": 0.8859,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5809210582543565,
+      "learning_rate": 0.0002,
+      "loss": 0.9082,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5735256089738234,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.9748,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5052310574234449,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.8814,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5737885547299703,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.8842,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5237047240583436,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.8155,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.626094153559293,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.8704,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.49669603205928,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.8471,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5730289800881923,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.9009,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.523720158885521,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9077,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5260323034094564,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.9066,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4652749578063743,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8705,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.5261765722589641,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.879,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5450535617021041,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9356,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.44327870329129865,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8761,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4694133788580677,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.7869,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.527008252484748,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.9204,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4701252432281297,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8754,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.48583022123196745,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8034,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4835202635149471,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8875,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.506312540888068,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8465,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4340404299246127,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8459,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4500528389649417,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8574,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4257338464346157,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8621,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4373720446193359,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8511,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.48306945420553293,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8102,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4581716737765401,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.9058,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.47027795850992604,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8383,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4759605551389717,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.832,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.44489231449356653,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8074,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.41453774455730197,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.7913,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.44096292259798536,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8537,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4578671860720513,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.7987,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.456394506352762,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.9166,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5715403594135529,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.8748,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.45471408830104054,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8128,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4150435625868661,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.8093,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4128857957966661,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8009,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.42300454464386417,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.8088,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4909389370245195,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8211,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4204437825519599,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.836,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.41517854198610105,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.7663,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4126379219703116,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8117,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3995468225425603,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8237,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.44308231794851455,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8533,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3933044567365795,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.7194,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.42377677279686027,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.791,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.44100566311913353,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.807,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.42240855138098576,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.8092,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.44453501976497334,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.741,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.4585697459896131,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7875,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.49505450824682223,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.9184,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.44646042397781077,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.8436,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.44598647925458745,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.7649,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4214221216363781,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.7784,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.45227184298961903,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8158,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4040848091911473,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8471,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4687728839581517,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7811,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.40027728864449313,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.7781,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.44021524315744653,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7774,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4053592172372432,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.7382,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4064108947924071,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.778,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4573478740667755,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.8036,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.43521713254634253,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.7037,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4872643814124734,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8449,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5829514491611524,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.9214,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4632547941021662,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8129,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4094238985943658,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8073,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.39371642360909304,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7885,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5496513465587326,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.848,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.5168778359980416,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.7576,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.49770210847776,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7872,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4768471023589497,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.9025,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.46799804884228635,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.7977,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.6111257397554586,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.8315,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.48243384372342013,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7998,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.44643797481247555,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8077,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.5199204456546236,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8154,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.48964963393648087,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.8594,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4378723113342222,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7646,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.43319531220408636,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.8159,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5565230031288813,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8249,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4749038313521005,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.8222,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4413848996413543,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8223,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.5162310450089371,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.9414,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.477965982604003,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8763,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.43195358779919846,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7583,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4856603711240389,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8111,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.45508732740535235,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7746,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.43409683925481063,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7758,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4089576078302274,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7664,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4572742935111086,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8346,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4165602802496886,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7574,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.42531464700157917,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7713,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4189404767776771,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7765,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.41752852415604136,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.6862,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.48896299612656,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7441,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.5274334652372455,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.8666,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.47976529095518916,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.8523,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.47384189744054706,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.8258,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4663707910797713,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8216,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.40466388928952995,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.6917,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4045145628766846,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.787,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.40223285013388077,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7167,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.39586520973460043,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7342,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.515213255581294,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8779,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.49916465611083227,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.737,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5297401665467757,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.8885,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.48401564617054504,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7736,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5318711299830755,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8255,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.5203546383396569,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.8579,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4466955402618473,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8372,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.43889266882348626,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.8065,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4686283414276429,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7721,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.46181916173875265,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.8308,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 1.1664688148777886,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.8059,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.43732852103952685,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.858,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4400476906570904,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.8223,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.37050483573414034,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.714,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.45862151205497165,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7781,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4442232869655172,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.8805,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.43746776301879925,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7322,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4886796303948087,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.8185,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.45164193050243195,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7738,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.46359406856527713,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7534,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.47446632746929185,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7351,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.41229097641545315,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7103,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.44348387315275645,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7993,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.49196078833834966,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7868,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4338579023775582,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.8031,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.6994487653762459,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.8465,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.559336834442375,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8581,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3917694435086478,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7599,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3764541665137546,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.6855,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3824503642860104,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7803,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.44439215452009584,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7088,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4221325726629454,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7315,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.41321303437913515,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8322,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4121968181469142,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7856,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.40550187666587395,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7344,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4087353652660321,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7539,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4345196746937004,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7812,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.433613477877486,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.8072,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4060850663863163,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7538,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4823324231748682,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7305,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3634968717832757,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.754,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.5342361721517209,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7521,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4128252327373045,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7943,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3564308128607441,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.6861,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.36480024633628716,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7548,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4304158686063223,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.6694,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4179596357660802,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7384,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4450104509243802,
+      "learning_rate": 0.0001,
+      "loss": 0.7904,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.41911222525899255,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7513,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.4370069264323097,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.8009,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.48398984419907476,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.8622,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.42412653600318445,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7707,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.46585747957106305,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7407,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.43496655382328697,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7837,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4160469387945875,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7998,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.45209366925053024,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7506,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.45837697636906116,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.8359,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4520572966553641,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7738,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.41486392065415506,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.681,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.44158687529956925,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7782,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.37624162529254834,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.719,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5264135969316172,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7705,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4157147562878753,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7895,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3976821965716464,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7239,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.43353829558388224,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7398,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.38614809151311424,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.6837,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.40695965628587244,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7224,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3705268756816273,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.6957,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.37224292668224535,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7061,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.39949350081200635,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7367,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4420689247752516,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7376,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.414213858078857,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.6978,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.44715963773825573,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.7319,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.419463795582743,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7685,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5305448031704577,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.8223,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.44307595364597574,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7755,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3879214149914402,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7146,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.417380175997682,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.728,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3993602836290595,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7149,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.4114404798361249,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7508,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.4081368204736118,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7242,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.41641053203328376,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.7608,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5162978723322689,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.8776,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4416732459374655,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.8061,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.40922067204695417,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7072,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3910408579777948,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7045,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4294596839944686,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7395,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.45617607800708243,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7262,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4249970983415297,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.6206,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.38178663102554916,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7617,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3884939823750097,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7651,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.36729627243649654,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.7126,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.47826654307607736,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7822,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.36161038152146086,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.6956,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.38006603015370655,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7106,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.40738285628597803,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7725,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3374199271417341,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7519,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4761794602226593,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.8196,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.41660560521680234,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.692,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.40816431754864885,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.79,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4053361082334398,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7295,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.42265666409916125,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7094,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4271793829562403,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.6791,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3791251414844227,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7291,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.34542493641922667,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.707,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.36758317059781026,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7234,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.45619840247066157,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.8309,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3620017253708174,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7362,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3982523955481365,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7207,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.38184227358525125,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.7306,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4472881830492068,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.8214,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5409463111197855,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.8137,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.8781222763001304,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.8318,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4573105786147632,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.725,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3878888689024318,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7151,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.36990394677897365,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.6794,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.7414686994665193,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.8003,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3653209812387363,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.6701,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.38286313485527906,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.6541,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.38402242018511595,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.7397,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.4025903717697035,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7243,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.6383945559677234,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7876,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.595752420576717,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7897,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3650985592911939,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.6789,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.49809803666203206,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7296,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.40778843923946057,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.7261,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3814737670780612,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7428,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.37005090547095554,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.6798,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4021157794755927,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7861,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4832224308224902,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.7694,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.5222420995257865,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.7208,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.46968945645061355,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7398,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.40669039374836935,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7821,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.40816318256752393,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7062,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.375101363348895,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.6283,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4324430468772202,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.731,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.38879064939889446,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7415,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3692851453372047,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7277,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3486513220133344,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.6959,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.44618582177638677,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7638,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.42547013855164556,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.7555,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4369715149874801,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.6905,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.41733388331032406,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7008,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4279520566519289,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.7013,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.37575572736837176,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.695,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4435807395712066,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.8064,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3768698645952864,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7009,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3726210345324877,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.6723,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.48204144200060334,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7801,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.4046556701900165,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.6695,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4138196095845522,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7282,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.43139821829854313,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6608,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3717057475902722,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.6974,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.42052364318572816,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7487,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3569959670544782,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.6482,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.38847176021176255,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.6822,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4927376300455678,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.6963,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.4543632723679882,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7343,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.43839276302132524,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7572,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4132661946203213,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7811,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.462865482643029,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.739,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3872644843649215,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7639,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.5394434644926044,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7964,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.40421908805526796,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.6796,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4243003617984817,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7039,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.45483524594960506,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7669,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3953883062663472,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7068,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3777504055589925,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7404,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.38222760426249,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7188,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.34598602379610305,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.7038,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.36538375721222566,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6319,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4020557339041932,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.6852,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3757675955529075,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.6756,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4004774320585294,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.6527,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.42414155376682977,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7401,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3803954584255736,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.7108,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5025161262007907,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7621,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.42764338083831827,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7229,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4573366276957102,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7085,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4218408813127882,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.7889,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.48315909057614376,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7131,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4459536132032088,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.8133,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3720322174198035,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.6519,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3889515936331356,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.6526,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4756886306246455,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7094,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.43376182376356753,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.749,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4246866739631707,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.6724,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.367498280328426,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.7028,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4259633905488669,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.692,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.3840615778351889,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7006,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.39249924769502315,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7546,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.41658451162515586,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7532,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.495311090413998,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.8044,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4467776892689143,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7202,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.40244598198055637,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6841,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.38046173309726555,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.7399,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.35812245930234876,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.6724,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4140137745357569,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.7405,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3989913820387328,
+      "learning_rate": 0.0,
+      "loss": 0.6796,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 268745683697664.0,
+      "train_loss": 0.7839559385409722,
+      "train_runtime": 4830.1995,
+      "train_samples_per_second": 1.035,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 268745683697664.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ca054b4e1a4cb2f87c721040f4eb090a8d1ef56
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "o_proj",
+    "gate_proj",
+    "q_proj",
+    "v_proj",
+    "up_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..10138262f50c53b138494c0a47870bd5f8b98634
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2125dd55328c02732e6e1395a2477e8c8b7d1d847459aacf6f0f3d8fb1ee1f6d
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4d27b2535bc6037ac75c7d592638492046e7e0b6
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f463687a87e9276099a7514b3bd94e68802cac249240f9380f3c4a7f1e15a1f2
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..354e3ad8b86c59eae4993818defdd11d2c9d5dc5
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,1134 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9499681432110237,
+      "learning_rate": 4e-05,
+      "loss": 1.4454,
+      "step": 1
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9758169479697124,
+      "learning_rate": 8e-05,
+      "loss": 1.5182,
+      "step": 2
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.7382712135428642,
+      "learning_rate": 0.00012,
+      "loss": 1.3954,
+      "step": 3
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.089772647132298,
+      "learning_rate": 0.00016,
+      "loss": 1.1687,
+      "step": 4
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.0435192868527843,
+      "learning_rate": 0.0002,
+      "loss": 1.0096,
+      "step": 5
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.7711189389458536,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 1.021,
+      "step": 6
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.48985254526410515,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.8913,
+      "step": 7
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.47364793976366343,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9003,
+      "step": 8
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.49327744700652487,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9267,
+      "step": 9
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.45752990733305904,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9139,
+      "step": 10
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4724605651441254,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9408,
+      "step": 11
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4153618059983818,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8533,
+      "step": 12
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.43872032715248077,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.9176,
+      "step": 13
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.41300429747993544,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8658,
+      "step": 14
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3869759096230377,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8707,
+      "step": 15
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3723540582481531,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8743,
+      "step": 16
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.43460617257627365,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8422,
+      "step": 17
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.3636882439505951,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.885,
+      "step": 18
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.3767420330312791,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8368,
+      "step": 19
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3406912995511453,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.835,
+      "step": 20
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.33117330188035315,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8708,
+      "step": 21
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.3742514806628294,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8547,
+      "step": 22
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3318707541389069,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8137,
+      "step": 23
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.34753505685394287,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8179,
+      "step": 24
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.30730689317524723,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.809,
+      "step": 25
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3148215426142574,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8289,
+      "step": 26
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3134692640328103,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.7951,
+      "step": 27
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3338606409194597,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.801,
+      "step": 28
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3217309263031535,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.781,
+      "step": 29
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3624509351724031,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.86,
+      "step": 30
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.37146419689776405,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8107,
+      "step": 31
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.31965751320137953,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8001,
+      "step": 32
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.30710001958247035,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8178,
+      "step": 33
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3226409081760079,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7828,
+      "step": 34
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.30238598320280313,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7647,
+      "step": 35
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3272140288265171,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.7543,
+      "step": 36
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.37431785142236074,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8817,
+      "step": 37
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3040480104274814,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8085,
+      "step": 38
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3510312247121624,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.8227,
+      "step": 39
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3423625936053891,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7742,
+      "step": 40
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3648454521920254,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8479,
+      "step": 41
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.37752553285739987,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8158,
+      "step": 42
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.5139251891902187,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8121,
+      "step": 43
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.33201787227820184,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8126,
+      "step": 44
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.36263163138465143,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8131,
+      "step": 45
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.33167025843660425,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8204,
+      "step": 46
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.35387812318119244,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.9064,
+      "step": 47
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.32546257173439347,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7826,
+      "step": 48
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.34640750110083973,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7741,
+      "step": 49
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3226221675147661,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.791,
+      "step": 50
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.315562950778229,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7615,
+      "step": 51
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3594636590102823,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7334,
+      "step": 52
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3647397045233328,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.8031,
+      "step": 53
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.33851642590902986,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.837,
+      "step": 54
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.29718756649280836,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.751,
+      "step": 55
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3142832139988199,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7477,
+      "step": 56
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3325477610175841,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8051,
+      "step": 57
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.37146073441055394,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.8098,
+      "step": 58
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.36048313195055315,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7997,
+      "step": 59
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3500392080646858,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8462,
+      "step": 60
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.31468963310853965,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7831,
+      "step": 61
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4084100906531495,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.8192,
+      "step": 62
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.33355778834850514,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.8336,
+      "step": 63
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3121111615175075,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7478,
+      "step": 64
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.330025473226303,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.8114,
+      "step": 65
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.32343888780989766,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7916,
+      "step": 66
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3303905759396039,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7448,
+      "step": 67
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.33442251076374147,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7537,
+      "step": 68
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3253341707475316,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7906,
+      "step": 69
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.40322939877722475,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8455,
+      "step": 70
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.2894092016470807,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7185,
+      "step": 71
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.31132315142861977,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7429,
+      "step": 72
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3034494888254519,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7781,
+      "step": 73
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3015266217194556,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7584,
+      "step": 74
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.321935083376902,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7666,
+      "step": 75
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.33496778793189047,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7769,
+      "step": 76
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.33483461042093743,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7391,
+      "step": 77
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.35029433823654627,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7702,
+      "step": 78
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.27106878465082573,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7186,
+      "step": 79
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.31734658980806324,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7045,
+      "step": 80
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3159312041607431,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7688,
+      "step": 81
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3510053326354264,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.8354,
+      "step": 82
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3313711207225331,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7626,
+      "step": 83
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.31851365189344344,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7941,
+      "step": 84
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3683419820928145,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7969,
+      "step": 85
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.303900770492725,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7311,
+      "step": 86
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3195515925884969,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7518,
+      "step": 87
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.34328458663535893,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7791,
+      "step": 88
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.29192287357056207,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7308,
+      "step": 89
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.29828259930043177,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7037,
+      "step": 90
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.2828954581458353,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7057,
+      "step": 91
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.30315109744782637,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7378,
+      "step": 92
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3134055693364635,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.7212,
+      "step": 93
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3451596219699706,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.8015,
+      "step": 94
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.29775782579656,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7456,
+      "step": 95
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3291043205732606,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7262,
+      "step": 96
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.2906190303452574,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7429,
+      "step": 97
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.34321569626617143,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.8265,
+      "step": 98
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3158233835384171,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7595,
+      "step": 99
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.30111907255901293,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.726,
+      "step": 100
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.30986176836690604,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.6807,
+      "step": 101
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.29450367356819085,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7687,
+      "step": 102
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3310156455760937,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7563,
+      "step": 103
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.2677526853851952,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.708,
+      "step": 104
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3035262178953314,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7715,
+      "step": 105
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.33621579158338233,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7644,
+      "step": 106
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3010244557747383,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7646,
+      "step": 107
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3067826338841327,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7047,
+      "step": 108
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.26568750293000415,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.723,
+      "step": 109
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.2989424371151441,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7847,
+      "step": 110
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.2917555500270605,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7342,
+      "step": 111
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.29628166341367834,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.782,
+      "step": 112
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.39119195663022793,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.8312,
+      "step": 113
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.30694775810386343,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.726,
+      "step": 114
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.310752903744039,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7483,
+      "step": 115
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.26099588812120716,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.6752,
+      "step": 116
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.28492516975887444,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7416,
+      "step": 117
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.34517639491760527,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7962,
+      "step": 118
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3060176941402745,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7104,
+      "step": 119
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.2828848083006681,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7432,
+      "step": 120
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.2777162172957675,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7413,
+      "step": 121
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3170164491566114,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.7569,
+      "step": 122
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.321325071237149,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7699,
+      "step": 123
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.27620495853360366,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.6775,
+      "step": 124
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2941483196149313,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7463,
+      "step": 125
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.2638396195061165,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.719,
+      "step": 126
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3195419763352248,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.7671,
+      "step": 127
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.29556898125453157,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7125,
+      "step": 128
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.29771793027755095,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7065,
+      "step": 129
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2922812432322988,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7679,
+      "step": 130
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.2878303583502017,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7354,
+      "step": 131
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.2977577857426242,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7079,
+      "step": 132
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.2901915437404231,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.6891,
+      "step": 133
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.28352945726609136,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7081,
+      "step": 134
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3241683039687902,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7052,
+      "step": 135
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.3297607032361027,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7568,
+      "step": 136
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.31796151841422116,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7706,
+      "step": 137
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3487317539823554,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7904,
+      "step": 138
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.30800851312340427,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7017,
+      "step": 139
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3099184827803778,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7483,
+      "step": 140
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.31481061184272635,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7396,
+      "step": 141
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.2681021791486132,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6792,
+      "step": 142
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.2964128919386318,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.6899,
+      "step": 143
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.31152184572681707,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7062,
+      "step": 144
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3385766223989507,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.745,
+      "step": 145
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3580776963221539,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7262,
+      "step": 146
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.33745749791756396,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7639,
+      "step": 147
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.2898419277646106,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7454,
+      "step": 148
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3068386648982399,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.6922,
+      "step": 149
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3052370320816486,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7228,
+      "step": 150
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.2855813856814325,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7091,
+      "step": 151
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3812616847566796,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7375,
+      "step": 152
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.34762263468639226,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7907,
+      "step": 153
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3071632002763469,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7126,
+      "step": 154
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.2743953863530211,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7176,
+      "step": 155
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.34240904021127194,
+      "learning_rate": 0.0,
+      "loss": 0.7214,
+      "step": 156
+    },
+    {
+      "epoch": 0.9984,
+      "step": 156,
+      "total_flos": 391925396799488.0,
+      "train_loss": 0.7946632943856411,
+      "train_runtime": 4833.0764,
+      "train_samples_per_second": 1.035,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 391925396799488.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..cd791ba0e59c63ed92e0aac5f633b0609a9ccb9f
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "q_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5d92c2d48634080679166b1702b7732636f61cc6
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd86f5378de423b831a40299e4fdd9539d90ceef220c946c3fe1b4905549d668
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5f8979bc20bc02f43ada5abf62484595711d9481
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dac86a923535fb17ad6727e74e80ce96d588fad79ee0191bd14195b64f19a13d
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1edf24cfcbf106106f3fabaf556b9e35f22a84ef
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 1.0996556147448786,
+      "learning_rate": 2e-05,
+      "loss": 1.4738,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.155590197159421,
+      "learning_rate": 4e-05,
+      "loss": 1.6051,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.146820508048761,
+      "learning_rate": 6e-05,
+      "loss": 1.259,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9163565348699131,
+      "learning_rate": 8e-05,
+      "loss": 1.3473,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.9417661773161884,
+      "learning_rate": 0.0001,
+      "loss": 1.1457,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.0858107165146322,
+      "learning_rate": 0.00012,
+      "loss": 1.0252,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.9643047603536231,
+      "learning_rate": 0.00014,
+      "loss": 1.0901,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7884322251663225,
+      "learning_rate": 0.00016,
+      "loss": 1.0381,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6207317444770607,
+      "learning_rate": 0.00018,
+      "loss": 0.9786,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5542708757243915,
+      "learning_rate": 0.0002,
+      "loss": 0.9105,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.6661573203112948,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.9134,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.581721865275086,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.8831,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.6258602792844252,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9742,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.8144383904909923,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.8465,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5506198986379944,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.8677,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5961739562843801,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9714,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.629664764526395,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8722,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5227064497725998,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9123,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5224978847489002,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.9474,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5416788737330189,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8616,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4664391292587506,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.873,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6092522246389432,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9629,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.529580513692818,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8706,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4625768462826553,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8322,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6022314082445355,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8923,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.495252362666634,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8946,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5573210343339869,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8745,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5263467061927173,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8315,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.45530522891080205,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8728,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4488557169584717,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8714,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.422211135636021,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8085,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.43629793688244217,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8506,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4934816820721558,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.9185,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.49422231415258716,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.881,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.42607497457838045,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.8277,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5082800492131699,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.7598,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4241801294691474,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8652,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.41316516092448563,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8215,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5063945285706223,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.8528,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5019548596596508,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8396,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.48735346313769123,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.8144,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5197868954555636,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8153,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.48803173140285644,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.8591,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.43518894315913303,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8188,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5364419130913426,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.9385,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.435592143632023,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.7764,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.48004579520498175,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.7933,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.527366295217339,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8798,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5380112089762901,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8508,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.46806208738974403,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8426,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.44311714881174813,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.7469,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.45340181396943946,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8339,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4544588923922235,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8198,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4488945712149492,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8535,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4697023213136025,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.8351,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.44940044996293804,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8159,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5035390443723099,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.8946,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5032674537128853,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8703,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.42842145274459875,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.8642,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4972268182951825,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7786,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5821189188510526,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.8606,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.40727673401873243,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.7698,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.42146358155953934,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.7587,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5060938378098108,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.825,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.47374672312323146,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8218,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.39289842872392167,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7907,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.40504091920518287,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.7731,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.46608745877751256,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8955,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4945171098342708,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8577,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.8348779091493823,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.797,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.43537627735797024,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.884,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.44115911348304754,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.7575,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.41669159487312485,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8395,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.43611174569275457,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7673,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.48683682101110376,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8394,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.39225378081689416,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.7666,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.461179637171612,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7901,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4411987636616236,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.8018,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.5385646530712281,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.8043,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4357285533429329,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.8693,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.45406278912963355,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.8352,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.45523658265619166,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8385,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4244244156508769,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.7901,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4569043840214544,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8455,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.45844501767610796,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8152,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.5322576158474753,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7228,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4138777808285333,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.8249,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4799177917816522,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7839,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.47193551015131996,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.8329,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.505900314758877,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8218,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5310009281948525,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7454,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.43552333332885873,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8166,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.41823268630022065,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.8135,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.42006171215316657,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8138,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.48524525282413145,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.8095,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.444659269135687,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7406,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.39882715463751306,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7929,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.40893464140153446,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7789,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4556551565194326,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.8106,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4710503877413765,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8376,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.44873922703843017,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7397,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4780873178804398,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.6855,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4292692486434121,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7567,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.451175302234134,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7917,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.38705001178653353,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7167,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4969376218965549,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7367,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4548344658229783,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7754,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4224808161911037,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7699,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3594920451103802,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.7257,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.526316503013457,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8518,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4354561024147389,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7457,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3834512633324317,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.783,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.42602748759310605,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.8114,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.46161254448655237,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8501,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4647927281034576,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8369,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.38934195690887574,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7882,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.6055647046806627,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.798,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.48064016323776393,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8316,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4036750964150837,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.7445,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4069960376767926,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7529,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3780256529635664,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.7842,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.37948188830044266,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7771,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.38434792324076283,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.711,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4078342044913948,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7397,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4494696168619162,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7915,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3762112960128191,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7708,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4128183849828177,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8064,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.43747572041533334,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7968,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.37451679628856716,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.823,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.49343862517101406,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.797,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4059370775780427,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7822,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4765446987311867,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.8045,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.44304669968975896,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7983,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4495572284704048,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.8173,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.49519931957917346,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7146,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.45440934320219123,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7902,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.46136539516278813,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.8228,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.46998132335066334,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7491,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4857141173134919,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7795,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.37304693609839623,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.7062,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.45211235804047367,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7339,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.46871049328137226,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.8194,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.40545006074712786,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7521,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5132829944284613,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8219,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4217086703740498,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.8332,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.43581538181417107,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7329,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.43691731849117116,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.8389,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.43966396101540145,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7951,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.38406120627380513,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7095,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.49975454028098837,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7603,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.39940277248775485,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.6845,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.409292796075142,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7485,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4620447146843351,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7555,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4949103640758731,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7818,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.48693309628498127,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.822,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4275166982224059,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7868,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.47251853589629417,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7993,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4408106892384705,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7888,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.5455614880532736,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.8227,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4297050359627414,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7715,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.39100077443136555,
+      "learning_rate": 0.0001,
+      "loss": 0.7334,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.46422894754991223,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7624,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.37146356743726766,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7447,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.38940213180935757,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7478,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.43337090859888355,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7858,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4449843824081987,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.6978,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4444574240792365,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7824,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.39391177448125786,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.6804,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.434169523981853,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7449,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4052559568894996,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.757,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4093093475639199,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7287,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.5057830506402323,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7704,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.405469711690343,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7555,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.41785558744905005,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7296,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.39699871829153915,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.6315,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4027572504138232,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7537,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4350632254202678,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7564,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3939867904779663,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.6851,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.43939859961499766,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7935,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.48625092642856155,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7706,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4981537473001381,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.7447,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.43905428940129065,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.669,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.40314450778540883,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7944,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.47831491273949467,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7644,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.376804270353829,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.6484,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.40960972586278444,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.7668,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.47390937055957816,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7606,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.39995555750872014,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.6889,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.43246771778131426,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7386,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4511103506749145,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.6925,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3744787661442005,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.6582,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4015516253801773,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.6855,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.46137894759038195,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7506,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.36133467499460037,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.6634,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.398949905319442,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.7445,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.43191616189051685,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7167,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4382384369223291,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7552,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.364192338002942,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7017,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4273232505653493,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7702,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.45241456258728546,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.6983,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.41870882008182425,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7223,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.36662315343893065,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7102,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.34240621241431185,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7169,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3986673895031164,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7466,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4311966052909308,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.7265,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5541661302074115,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7585,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.44000575936807124,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.6811,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.47849947838940127,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7414,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.43089757195163225,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7657,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.44970102772398723,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.6995,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4017912074199865,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.6994,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.37774081653175595,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7418,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.4471206292757365,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7225,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4289588771903726,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7391,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4957800599644603,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.6979,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4169874944494124,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7691,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3873593192271462,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7223,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.35372466810924785,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7342,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.41673004219163245,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.8016,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3824442425010731,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7741,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.40158610632934527,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7634,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4244144316802503,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7989,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.6061648550240569,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.719,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3792471751631609,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.735,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3380204782373729,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.6407,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4869562528787163,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7544,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.5001594842961744,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7652,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.45072491971435924,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7817,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4100150948448548,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.6942,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.40874454525721804,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7549,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4260284237081343,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7028,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3552812102786192,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.681,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.44069685250106094,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.8137,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.4690337765690944,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7733,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3741311567109358,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.699,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4028572108946936,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.6446,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.45854603721647536,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7475,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3681731161185109,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7596,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3776434010253435,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.7007,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3738418901709509,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7149,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4835092594634167,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7032,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4017853036129437,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.6546,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4480182382215609,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.6762,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3812030302518587,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.6977,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.40746694197290256,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7143,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4111923252375617,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.6488,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4073753912868798,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.683,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.388898515608764,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7862,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.385706083912587,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.6774,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.41895090641408606,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7269,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.47082427321169534,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.745,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.38248626438781763,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.6811,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4254072536431337,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7069,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.37837697092083017,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6164,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.38604759856266024,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.7188,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3871010273288266,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7856,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.36665735756837253,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6653,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.41254820532951914,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7099,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4222647650532378,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7257,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3622288431601804,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7196,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.43802019298705874,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7194,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.515059265768125,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7283,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.42005833634395573,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.6789,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.43683440181136446,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.6844,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4115772545929074,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.732,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.34094173712492826,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.6698,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.40968587705008036,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7368,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.341086694470386,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.652,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.39824391443335655,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7309,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.40651294928363996,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.6989,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.47718296599282495,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7175,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4288787554748259,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.6879,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4376532859776581,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7295,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4050176075548045,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7493,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.41762319521896446,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.6795,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.5144078304017623,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7602,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4664003629503036,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.7801,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.428652215880614,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7168,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.39893922809100696,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.726,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3518817379354701,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6395,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.4669389577355934,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7581,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.38174878134103185,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7058,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.40214897406185196,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.6688,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.5337589314562605,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7816,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3904988161266009,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7557,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.38486028703867425,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7289,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4776346872652581,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7448,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.43484107579010384,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.6657,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.36820384941205775,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.7404,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3970556699438135,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.6972,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 2.58235750777289,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.67,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5069706192556233,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7102,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4923262833153959,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.7609,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4142497515699397,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.6891,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3792544487029188,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7453,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.45932577189942847,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7927,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.39999104491608034,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.6499,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4266247591676935,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7357,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.40088208414107185,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7559,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.37949174127982427,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7175,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4595268076207302,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.7367,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.41448073681420394,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7889,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.45148737131128025,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7616,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.5663093229287312,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7589,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.44329650062299025,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7482,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4885780490332609,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.796,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.47646806559659416,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.788,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.4473469054171859,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7049,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.36500013878681864,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.6675,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3952870718353004,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7168,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.35403001404819195,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.7338,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.4191626699523053,
+      "learning_rate": 0.0,
+      "loss": 0.7846,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 273934979694592.0,
+      "train_loss": 0.782174748296921,
+      "train_runtime": 4830.1855,
+      "train_samples_per_second": 1.035,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 273934979694592.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b26b3f5e148b66cdcc899e12b200aec7e66dd956
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f9b1ecab6d4efc4660af745ca103f2591711b9e8
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4b8a982da8abdd36e0f519116503d61efcac7cfc555ef7920a46d13be3e28d0
+size 671150064
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..32ba8d5e6d8d938e923bb098c9909f532859a5b7
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:090d67a5e8b539a067a31c2fbac49c8951630d0bacd85d8940f5e5a92bb6c5d5
+size 918507402
diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..75489b583d64b22e5968095b01180eac92b9cecd
--- /dev/null
+++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,1134 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064,
+      "grad_norm": 1.033650962864784,
+      "learning_rate": 4e-05,
+      "loss": 1.5395,
+      "step": 1
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9072482054967783,
+      "learning_rate": 8e-05,
+      "loss": 1.4104,
+      "step": 2
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6929696257997826,
+      "learning_rate": 0.00012,
+      "loss": 1.2895,
+      "step": 3
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.2743831532728433,
+      "learning_rate": 0.00016,
+      "loss": 1.2358,
+      "step": 4
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.9635249398731617,
+      "learning_rate": 0.0002,
+      "loss": 1.0529,
+      "step": 5
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.7620736106367577,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9811,
+      "step": 6
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.548706344742551,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9539,
+      "step": 7
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4838581066198791,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9446,
+      "step": 8
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4729470711419524,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9248,
+      "step": 9
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4410831370616841,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9363,
+      "step": 10
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4263271322679063,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9441,
+      "step": 11
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.43101174777747714,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8779,
+      "step": 12
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.37861484383938987,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.915,
+      "step": 13
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.3801259524726845,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8727,
+      "step": 14
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3915827921639002,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8938,
+      "step": 15
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3476027967669314,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8467,
+      "step": 16
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.3818502534495098,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.9224,
+      "step": 17
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.37169132676026895,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8099,
+      "step": 18
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.3396087648907952,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8625,
+      "step": 19
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.381238289245554,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8612,
+      "step": 20
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.38304900844693557,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8244,
+      "step": 21
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.3592327009433714,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8547,
+      "step": 22
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.40916090982578535,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8672,
+      "step": 23
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.3635660633456493,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.843,
+      "step": 24
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3434073608671318,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8477,
+      "step": 25
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.32796730748027947,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.7971,
+      "step": 26
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3090897990220882,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8387,
+      "step": 27
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.35001291264577755,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8317,
+      "step": 28
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.36428491335127045,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8841,
+      "step": 29
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3243954309502511,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8171,
+      "step": 30
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.33128151860617544,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8143,
+      "step": 31
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3243232431069116,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7925,
+      "step": 32
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.31250678604816295,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.812,
+      "step": 33
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.32678858604781935,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8368,
+      "step": 34
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.35597874869490725,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.8239,
+      "step": 35
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3089808870901914,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8152,
+      "step": 36
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.2946998902064204,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8018,
+      "step": 37
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.30147846071695233,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8034,
+      "step": 38
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3495620439449062,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7917,
+      "step": 39
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.35017459683865043,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.8415,
+      "step": 40
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3255738294856779,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8381,
+      "step": 41
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.32143555884545844,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8136,
+      "step": 42
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.35224643730658256,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7686,
+      "step": 43
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3330551520720871,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8107,
+      "step": 44
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.33671933143502814,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8266,
+      "step": 45
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.33238469396347947,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7786,
+      "step": 46
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3119497080343661,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.812,
+      "step": 47
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.33556678648575605,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7689,
+      "step": 48
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.29754676091345855,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7856,
+      "step": 49
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3212550236302518,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8287,
+      "step": 50
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3223260509089707,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7134,
+      "step": 51
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.32472077807782423,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7696,
+      "step": 52
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3144612888628789,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7227,
+      "step": 53
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.31748264767086415,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7652,
+      "step": 54
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.33053354962059733,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.7867,
+      "step": 55
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.31913427175442033,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7623,
+      "step": 56
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3338267291180613,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8261,
+      "step": 57
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.31258460736200183,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.8039,
+      "step": 58
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.35243368970699207,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8099,
+      "step": 59
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.30397586139928945,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7435,
+      "step": 60
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.2747692975616536,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7764,
+      "step": 61
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.29199473562926864,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7222,
+      "step": 62
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3078900971458151,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7792,
+      "step": 63
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.324691531513113,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.798,
+      "step": 64
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3335964441250956,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.8066,
+      "step": 65
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3373556037280127,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.793,
+      "step": 66
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.32941353137034224,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.81,
+      "step": 67
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.35399403922854145,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7501,
+      "step": 68
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.34917009678959915,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.789,
+      "step": 69
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.33376866968787544,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.7425,
+      "step": 70
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.33811642179908985,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7682,
+      "step": 71
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3425051696044902,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7822,
+      "step": 72
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3107034281284392,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7819,
+      "step": 73
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.33282426680005467,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.815,
+      "step": 74
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.32805129818689494,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7319,
+      "step": 75
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.29585326468195644,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7203,
+      "step": 76
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3561659679909226,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7699,
+      "step": 77
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3469249098416054,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.8084,
+      "step": 78
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.33422921005367806,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7931,
+      "step": 79
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3503998377141623,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.796,
+      "step": 80
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3126441931642006,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7468,
+      "step": 81
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3021120067347973,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7493,
+      "step": 82
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3067566109741886,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7414,
+      "step": 83
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3094509650385033,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7324,
+      "step": 84
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.30257407103089773,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7559,
+      "step": 85
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3216531345523285,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.748,
+      "step": 86
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.30249740732137886,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7437,
+      "step": 87
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.31341182206609874,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.6947,
+      "step": 88
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3024434846916235,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7259,
+      "step": 89
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3411376996056166,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7815,
+      "step": 90
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.2856733234825206,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7155,
+      "step": 91
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.33601917343194493,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7818,
+      "step": 92
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.2918013505386562,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.709,
+      "step": 93
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.31526319175179096,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7301,
+      "step": 94
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.30250418306277116,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7195,
+      "step": 95
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.2790662847069248,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.6778,
+      "step": 96
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.2998584844908892,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7082,
+      "step": 97
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.29565362571693427,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7342,
+      "step": 98
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.29865538765003863,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7361,
+      "step": 99
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.30332168702872603,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7342,
+      "step": 100
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.29712882764596105,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7177,
+      "step": 101
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.2929311235146598,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7374,
+      "step": 102
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3118133634434965,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.744,
+      "step": 103
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.31646373104256637,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7162,
+      "step": 104
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3492955334143746,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7402,
+      "step": 105
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.29816146956270106,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.728,
+      "step": 106
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.32464952556140103,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7358,
+      "step": 107
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.32711705811548675,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7357,
+      "step": 108
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.2882275477535859,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7364,
+      "step": 109
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.2933869942149808,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7941,
+      "step": 110
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3031355662381024,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7894,
+      "step": 111
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3060693032435272,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7371,
+      "step": 112
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3259781194694751,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7029,
+      "step": 113
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3702015020625932,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7746,
+      "step": 114
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.30249066864439494,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7306,
+      "step": 115
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.2897777887282797,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7018,
+      "step": 116
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3344142843455273,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.8018,
+      "step": 117
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.29515807103280634,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.6781,
+      "step": 118
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3025305424101093,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7593,
+      "step": 119
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.28223422497881434,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7145,
+      "step": 120
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3066517795043625,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.6869,
+      "step": 121
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.30642475984381673,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.6967,
+      "step": 122
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3006047300862463,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.6908,
+      "step": 123
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.2930985762959953,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7421,
+      "step": 124
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.36699533414091595,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7152,
+      "step": 125
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3191184250838783,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7233,
+      "step": 126
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.2882949563889631,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6723,
+      "step": 127
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.2846618831877869,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7624,
+      "step": 128
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.2859925434034348,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.695,
+      "step": 129
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2939429745246576,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7305,
+      "step": 130
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3265178745760517,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7318,
+      "step": 131
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3091109426117566,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.6894,
+      "step": 132
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.2756556893678473,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7089,
+      "step": 133
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.28711391805261105,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7045,
+      "step": 134
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3018136761543984,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.725,
+      "step": 135
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.48785789730683393,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7147,
+      "step": 136
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2946491236531151,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7513,
+      "step": 137
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.35417168372316066,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7318,
+      "step": 138
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3429059982297449,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7576,
+      "step": 139
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.28996922983926465,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6904,
+      "step": 140
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3139948030812815,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7424,
+      "step": 141
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3313436912829056,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7348,
+      "step": 142
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.2862586930535613,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.755,
+      "step": 143
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3354881979829791,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7147,
+      "step": 144
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.2781947839944141,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7277,
+      "step": 145
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.31691833779027606,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.6999,
+      "step": 146
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3340163070707816,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7372,
+      "step": 147
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3170369724196319,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7801,
+      "step": 148
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.30594912415669967,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.702,
+      "step": 149
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.28892681135303727,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7478,
+      "step": 150
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.32463030584671543,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7746,
+      "step": 151
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.37353097089776116,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.771,
+      "step": 152
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3437871535153094,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7815,
+      "step": 153
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3425422993987472,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7599,
+      "step": 154
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.2730176554215713,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7038,
+      "step": 155
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.2825593416155209,
+      "learning_rate": 0.0,
+      "loss": 0.7707,
+      "step": 156
+    },
+    {
+      "epoch": 0.9984,
+      "step": 156,
+      "total_flos": 396278936240128.0,
+      "train_loss": 0.791970758101879,
+      "train_runtime": 4806.5232,
+      "train_samples_per_second": 1.04,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 396278936240128.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d226730e32a62891250be7d5ae91602ca108cb02
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cb2be08dd2722e760e6af1fdf9d73005ea729bd4
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:139128bcd4e4cf0bd4fa30166f904da62fbd0bbcd8e0c6ca5a8c343433679245
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2f7ce6843e5d8686d1a2f7789e9524b2c9284534
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f28376e29671c5eb16c9c9ff151bb033cac16d4b7273c57016217ca38f0d5f5f
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..960fdcd74886c92021d74df11e31afce06029319
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.0078361387590205,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4793,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9951652887052043,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3539,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.896763252301588,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.4285,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8977678456462442,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.466,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7792622012488023,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.2075,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.9015724813023276,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.2588,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.9539912695680726,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1511,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.4709119751155775,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.133,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.8781207054143169,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.0133,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7745071179422164,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.0139,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6668394118072065,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9199,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5593316156179626,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9649,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.6349418219797407,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9399,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.6020677206864543,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9946,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5438695848971745,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.93,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.6406972588190947,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8839,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.9803440490535646,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8615,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5593171972828492,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9058,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.47453770192222816,
+      "learning_rate": 0.0002,
+      "loss": 0.8859,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5909841084059502,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.9839,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5533211598200701,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9142,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5058722622366572,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.9088,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.5227840384513545,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.9214,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.674085142832656,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.9281,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5157118294581174,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8217,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5218051567846262,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9299,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5422848174392554,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8486,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5486004540317999,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8745,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5662083598242861,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9145,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5381570540759785,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.908,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5211311574621448,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.853,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.6184615559840395,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.9835,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.5179203492136157,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8231,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5422366829439323,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.9447,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4753478694892907,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8857,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4920573954678667,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8765,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4963933312524518,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8807,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5048401993711281,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8142,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4966672780201818,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.9338,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.49280895715078565,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8611,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5812776144179074,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.9696,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4418823813577595,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7965,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.6151833185403617,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.9346,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6356997114918449,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.83,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5937124928265956,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8051,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.47768756181848726,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.822,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.6140160811543658,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8776,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.516801826502444,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7676,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.6327146305470971,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.9702,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.499767257417617,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.9066,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4924254887419153,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8891,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4290292230420388,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7612,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.4408267627156317,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8049,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5494028698920849,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.9337,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5377528312119375,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.9272,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4653086051429184,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8384,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.47802147106541004,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8446,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.43698095251762437,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7921,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.496409999959844,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8973,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5212049672896456,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8597,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.5489720426969512,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8903,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4622687412170286,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8887,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.47706889510794903,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8442,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5376371336217138,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8536,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5021900107320479,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8365,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.49048052824057126,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8106,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5047693137325724,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8552,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5085475864936128,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.841,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5094769407721215,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8844,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5874346325784344,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.9541,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.5113285708843512,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8147,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.6470490657139865,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.9404,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.444158830017984,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7675,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.42718865954168317,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8347,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.49382933622072317,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8569,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.44584676514435373,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7556,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4583298153167484,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8167,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4487175680774925,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7373,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5304376805035587,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7214,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.49477371360533506,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8556,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.47898340285063606,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7639,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.47588171820515307,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8284,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.5177341087333374,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7955,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4972607753633679,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8604,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.433135078554396,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7556,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.43332045728202956,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7386,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 1.3570392266410942,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8819,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5174999688865969,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8708,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.5186816574437936,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7533,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4777235971030546,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7657,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4589332179072561,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7995,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4365452209568981,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8211,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.5361492053581633,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7863,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5774950306753667,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7324,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5311524204034596,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7907,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5211202018919038,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8306,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.4493531014468885,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7914,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.489894272189929,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.83,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.5084569157516184,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8685,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4402306624297196,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8289,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.44943832297185154,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8216,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5389533019974405,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.945,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.4105928618047756,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7293,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4966495166041549,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7914,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5620926313915939,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8699,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4998050408498213,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7713,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4356477744811087,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8587,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5255567615971235,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8821,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4979809254631378,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.776,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4362326947314551,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8105,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4893990121965114,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8302,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.41384047619369413,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7777,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5179890724609715,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8515,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4749814963541095,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8121,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.4605472390929984,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7584,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4817745217154679,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8047,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.513266584644474,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7862,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.49942743495151193,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8251,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4486948636118088,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8265,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4441154164066126,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8011,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4885346589356598,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7148,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5136035090126265,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8426,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5877167718087726,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8982,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4427950970842086,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7747,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.49655525453326005,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7655,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5534762211200998,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8763,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5530009730644346,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8714,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5657937559041026,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.9082,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.48622536048605935,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8297,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5158794849276632,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8475,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5045881311097942,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8018,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.44554526837359076,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7265,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.47940295287330836,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.9335,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.6038453663214071,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.849,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.44736082282588496,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7963,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4949858443463578,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.8341,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.4748678665983683,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8569,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4682317494097038,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7801,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.49593152677518926,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7977,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5112040698042739,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8287,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.5144222579666844,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8476,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4488921177561266,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.786,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.43113259073368637,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7908,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.46919227137644687,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.843,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4857406827968805,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.8096,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4718601050025545,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.8514,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5026802643275337,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8073,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4380565959244541,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7799,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.40950675857094365,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7193,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5496617762158832,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8451,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.44702446743165664,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7141,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4148928347211178,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7337,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.508432249555421,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8465,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.430962234328245,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7478,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.44406799965662086,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.843,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4534186344976709,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7591,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.38372442007494273,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7459,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.45382619962872733,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7637,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.6556052367907826,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7904,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4755089731315473,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7667,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5616630620433459,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.9078,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.47033499421903646,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8548,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5037243767448392,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.9103,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4203751041485349,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7424,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.411831853489757,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7919,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5326554379731826,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.8259,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4403852288388378,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.763,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5821163850741805,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.9002,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.591727532299676,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8832,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.46804647996398024,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7929,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.42203323854415953,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.733,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.46752128447350355,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7937,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.46846317463295867,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8266,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5263210553238438,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7909,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.47749438944518474,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7531,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5645333688308465,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.8229,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.446871568864066,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7403,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5520991124359584,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.812,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4963547298286559,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8204,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.474444324347614,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8623,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.4738044014485687,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.715,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.6103325138954283,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8508,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.5647299154567543,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.8472,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.41521843096999544,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.736,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4813369439604522,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7744,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4656646536960523,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.8342,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.44361921983143243,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8086,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4105783992742901,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.767,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5209585768818658,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7964,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.507362190210602,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8028,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.49789901798638553,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.8082,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.46776010139530455,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7808,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.43939402349934514,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7466,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.45520971799451754,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.8107,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.45900953614309165,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7699,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.43840709703169906,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7479,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.43202304168661726,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6823,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.43563508758389163,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.8153,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.448063176823589,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7478,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.48558137728839135,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7288,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.508285060888712,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.8239,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.40613474777069003,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7125,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.3996910851018506,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7347,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.43747574374571657,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.6955,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5165934739549206,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.8473,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3886022920700298,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7759,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3957656254609609,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.697,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.4484079810056674,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7516,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.4529369609607294,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7369,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5335332069111763,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8667,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5285752487579878,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.8172,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3969427948794199,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7163,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4410382415479905,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7776,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4159115145082527,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7557,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4742973139852936,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7229,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5216060088398573,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8442,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.43771511199061625,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7495,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.572707160140616,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8126,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.5544836753126516,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8129,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4532086857255862,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.8037,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.4918849781329419,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.82,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.41644324342222866,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.8029,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3963250611740579,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7718,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.38606317904676285,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.707,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.44871910710492474,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.802,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.42275304153937604,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7547,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.5053291473297197,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.8012,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.49099852316863113,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7517,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.5178109830426583,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8004,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4736718415067566,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7315,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.36026009543390325,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7507,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4535422092166841,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7609,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.4470489385174667,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7381,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5304272092082939,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8047,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4103397249611091,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7025,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4282875131696741,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7825,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.3959591611639748,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7256,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4297625468076226,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.686,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.4338317125135978,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7852,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5220480793777401,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8364,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.4718410962391697,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.8609,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4425877448351768,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7453,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.46482627683884487,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7669,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.45474128814187065,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.8225,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.489351688159818,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7513,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4376194429250256,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.72,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.41802244933852195,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7178,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.409712330038051,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7023,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.5079825393723467,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.8606,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5206399287075734,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.8485,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.7223531563041334,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.8535,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.46611260705327295,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7691,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.47362875116167796,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.802,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4389329527007687,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.8546,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.42871349078083126,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7478,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4720877579208673,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.8079,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.4959446137649537,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.8681,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4123902782366385,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6864,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.401872969849858,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7413,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4362674378527309,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.747,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4428292876454781,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7856,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4450863577634821,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7419,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4428815143185777,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.768,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.46916136107051853,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.746,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4119649528684462,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7287,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4242183391349964,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.724,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.43575759754640503,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7238,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4954243974585828,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7593,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.472036317691702,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7596,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4092156773129198,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.8284,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4588338499843614,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6954,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4902923158674284,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.8042,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.3951625455518061,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7471,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4398892247631177,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.773,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.405873315768006,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7573,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.44096360410755114,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7857,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.5224663568877417,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7902,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.43451134385124535,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7854,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.44300948878433516,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7104,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4370012262335751,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7029,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.42393845788721735,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7269,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3940993083465362,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7364,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.5130244197979122,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7748,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.38842129810108295,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7487,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.407653599404947,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7482,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3708263323809625,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7218,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4677986528867026,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7554,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.42020831082895277,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7586,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4339661675175279,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7593,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.393783616139176,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7852,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.45222289125813675,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7711,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.44485794781453797,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.8108,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.5271931699927285,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.8133,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4251533047014541,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6985,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4060145386422878,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7187,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.47221750761818,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7109,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3581608522783374,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7512,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.8522901308197447,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.8318,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4568743543110379,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7927,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4809043240428301,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.709,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.4959346652066973,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7968,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.48271862062812654,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.8049,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.442702118060553,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7725,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5620878781206495,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7311,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.39885638849328964,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6662,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.484997683733805,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7391,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4482024572192048,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7423,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4592358505318248,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7186,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.404374569147412,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7292,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4756451675893965,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7022,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.38598764122770934,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7359,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5088816244441849,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.8187,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.3991736960217553,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7517,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5061478823334051,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7845,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.38312118917493265,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6894,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.40434609793297854,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.713,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4226916043338718,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7646,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.43692635202741587,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.683,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.5029615187149719,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7439,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4105261731013297,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7488,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.3911529679615706,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6928,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4807368991319861,
+      "learning_rate": 0.0001,
+      "loss": 0.8258,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.5699018479220594,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.727,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3524982175685148,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7082,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4191858175776619,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7889,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.5089043426821488,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.8518,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.39226352932466774,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6196,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.47792692363151024,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6957,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.44851193673389383,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7451,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.36285820996546586,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.677,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.4231831734407228,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.8222,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4800096092127521,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6948,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3921582958393825,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6989,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4605387181041602,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.801,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3697875554335195,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.698,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.41037288560797924,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7173,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.5513440460409281,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7883,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.47597729596937366,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7454,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.40362975305365,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6796,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.47052048349930625,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.8046,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.39305474918369454,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7177,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4365853618850097,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7464,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.46443883518953344,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7721,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.6077144067602148,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7881,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4559313064847387,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7696,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.48334528459124015,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7902,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.4122577959320843,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7244,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3967435244897451,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.754,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.4959668953567126,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.8717,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.43450610691260594,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6782,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.49715994893977894,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6976,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4067290311812931,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7294,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.4690739518969345,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.7932,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4981800204868436,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7171,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.46035418939471695,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6594,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4645533919344865,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7845,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.37790686444774557,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6723,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5346137138402413,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.8126,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.5802932651326059,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7267,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3998975157211213,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7429,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.42551219430876636,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.7446,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3915114759014101,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7621,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.42707938100584764,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7594,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.4217286541447519,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.7066,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.4813579329876303,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.8272,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.5209421571253114,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7684,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.38388397495120674,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6926,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.38206811685405234,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7066,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.42079054181453934,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.742,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.45638369051473626,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7812,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.45426445383407044,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.8086,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.5540822685134705,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.8019,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.4241661579124998,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7149,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.41545262832705115,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7546,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.47854883595431164,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7275,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5417155836721987,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7819,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.3418374137491104,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6197,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.37827506427354074,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7662,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.37678842749919733,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6813,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.38552606554567614,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6521,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.39221566211771824,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6736,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5814535406006868,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.9052,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4316499224413905,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7434,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.42320188391812447,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6543,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5127523870382765,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.8075,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.37951728641973953,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7253,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.3968770913229026,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7194,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3788579760423852,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7089,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.4327607445645209,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7981,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.40926501415323624,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.8147,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.36710894474207795,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7413,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.37772753596676223,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7627,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4103176842308724,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7856,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4787503849919845,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.8353,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.5057142005966992,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.893,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.42615384142110324,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7218,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.38574767132403925,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.74,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4207612858058705,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7154,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.39842148956461876,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6989,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4004093364012239,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6954,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.40440694527606397,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7167,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4302673835565511,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7042,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.39755984527488586,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6723,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4766648621407094,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.7455,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.414554232063145,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6784,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.43440251704921684,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7554,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.4205705236293427,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7643,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5240881892346563,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7497,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.41002431801695394,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6746,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.39079544638877645,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7334,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.46154129162530355,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7821,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.43857763101329666,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6967,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4745556878551065,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.721,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.43407976222305766,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7525,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4377749247872826,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7309,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.36927751303735046,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7268,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3715247184775538,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7297,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.42401079947649123,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6671,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.46140173636102044,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7596,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4120529712114191,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6686,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.46392851458326273,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7256,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.41595596761697523,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.692,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.5004337449226676,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.801,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.35561942584168765,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6882,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.43571514552457885,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6887,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.48997334399222864,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7369,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.48227674033755163,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7833,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3959685747725765,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7303,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.694068235573397,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7366,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.49007966016681326,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7407,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.42312320284128413,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7419,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.534244873446867,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.8048,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3618785576090114,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.7265,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.45673110894257185,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6546,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.4761630111237167,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7687,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.38102982193869156,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.661,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.6090794065883615,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.872,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.8268384433744854,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6434,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.42124555231335026,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.8004,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4366646958594062,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7084,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.40981492026915917,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7563,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3963709876378491,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6875,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4350406188024903,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6756,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4856520578662212,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7326,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.444036509677123,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7455,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.44396341098705794,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6529,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3636036284538335,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7295,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5035071754980905,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6848,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3752917947702577,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6623,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4664576764813504,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7635,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3555481752677377,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6789,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.38442265931936404,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6852,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.41769954474382226,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.766,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3938081186676616,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.71,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.7310037019848448,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6511,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.46401448267889905,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.8104,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.4204022615858649,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.734,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.6267879420567184,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6532,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4369141796964484,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.713,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.42811342285199183,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7457,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4308026861336839,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7334,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4067378877022371,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7069,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.44304826502818534,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7197,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.4028710850374116,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6856,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4464052817108853,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7472,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.48050418959351304,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.7196,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.37982366825403496,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6638,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.44567261953196763,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.73,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.5719709727175741,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7953,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.504208070255483,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6936,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.45824944366726106,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7434,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.44150429031169014,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7289,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4566032504648868,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6659,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.42578567358612823,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6234,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.467928502515594,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7224,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.45007822419266574,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6943,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4108721661580886,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.7003,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4681752861870969,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.673,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.4080367829644397,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7153,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4133472524221295,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6626,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.46763846734027786,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.663,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.5539047759878163,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7581,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.38354130612880843,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6638,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4199217200454841,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.744,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.42056017357147596,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7472,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.369375746532977,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6312,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.4931786683894933,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.7021,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.47565217238459334,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6572,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.47582521206217604,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7609,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4241576219472043,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7538,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.5125539953983531,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7831,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3492899730093599,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6928,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.3876442697873921,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6825,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3974699605820346,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6964,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4629654109049974,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.7326,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.5108010894457006,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6617,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.45041661415736134,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6491,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.41031905128027335,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7253,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.463599990726639,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.7482,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.44554078726064866,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6834,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.5519141136342646,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.8325,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3867379104449113,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7434,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.46304328790007515,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7524,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4644753053921928,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.8113,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.46405361531317724,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7376,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.47290916009529965,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7238,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.38646347584691115,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6916,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4688049279162531,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6537,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.40977830334252874,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6816,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.41665687032634674,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7616,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.4449277242678797,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7309,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3862853262939256,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6341,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.417870667519656,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6424,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4121557958551208,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6814,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.42721070624989443,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6832,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.40377802251968753,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6557,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.3639893684851359,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6833,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5400234489168635,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7916,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.49982839638910553,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7992,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.49879430542119474,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.668,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.4329818300875685,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6549,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5055663085996884,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7629,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.45750246094611186,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6691,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.4497702082009352,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7286,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.38766683544166136,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6497,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.41779772342848726,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6854,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.5014496809811316,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7739,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3817297222875174,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.7148,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.45123970026378324,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6932,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.46221502063723513,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6627,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.4967064651434326,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7581,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4087309191651596,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6832,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.4018611215265461,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.7426,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.4238146860197108,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7402,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.45870761560589585,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.7407,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.4243634380902265,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6786,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.40647965811740283,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7112,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.5011817292459243,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7392,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4060613600018548,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6708,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.37370989838982127,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6711,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.46309490981922485,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.762,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3494927600003019,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.627,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3818463473849368,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6823,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.41204709750745633,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6753,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4223149820814058,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7162,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4287433314251421,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6805,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.42808426536818117,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7022,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.40190051662355125,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6966,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.4371617503095811,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.7524,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4287837509396157,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6495,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.37595505918327743,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6713,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3987376962211029,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.5807,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.43144229370725906,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6696,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.36384872319589756,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6645,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4038613126225733,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.7311,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4283741889682376,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6764,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.4612154444409386,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.8144,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.41363644715873055,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.7091,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.42739581915469604,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.73,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3669064171593529,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6378,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.6607804454509449,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6361,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3642953445748799,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6676,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4787090231117326,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7741,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.35372153768638664,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6657,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.40850515430804185,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.682,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.47580526864703004,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6537,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.43774834156062925,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6584,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4465029012018035,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.8118,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.40382894066508757,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7346,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4004050705875985,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6783,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.46148850023139054,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6339,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4142316676102528,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.7225,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4259084891522085,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6397,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.42425539806284446,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6534,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4564821915530866,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6699,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4327125181675075,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7108,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.46541524052046224,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6956,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.4303975211543722,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7029,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.4677034577629268,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.682,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.41970920358434805,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7246,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.44325982926714413,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.663,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.543833318950384,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.7259,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.4462626991617039,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.673,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.47848876324986994,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7009,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4330236099546544,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6902,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.42637120113247723,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6401,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.3973135444199052,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6778,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.6770882959135993,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.7072,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.4007464012163911,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6781,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.43284323516860274,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.7363,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.49863862119221625,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.7979,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.5242826580870628,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.7447,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.5720376546393448,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7279,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.39947577817568036,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6182,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4661137521776312,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.7281,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.37893881694450166,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6584,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4611650389520086,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6902,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.387985776119187,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6598,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.48483349583891894,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.7276,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4412989632525287,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6627,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.41484413216610105,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6308,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4690159050347405,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.642,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.4245437890533855,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7825,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3831070802690724,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.666,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.41107280846289773,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6961,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.6288518414789465,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7746,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.41613371819591594,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6798,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.4064966673354674,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.69,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.468932838961255,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.7338,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5218631986752835,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.7963,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3781245114300269,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6981,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.41240953882994247,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6917,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.43377123595954153,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6831,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.472989623336461,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.7305,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.3338845198810002,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6196,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.43989366995203444,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6993,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.5190236187058785,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6933,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.5606180991324131,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.7477,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.41189275650330803,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.7116,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3816408908934506,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6426,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.43072600762165386,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6697,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4098582536733861,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6662,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4947135664257666,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7612,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.5299959699798045,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.8105,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.46021437422339323,
+      "learning_rate": 0.0,
+      "loss": 0.7593,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 543415710547968.0,
+      "train_loss": 0.7692152653694153,
+      "train_runtime": 9780.5025,
+      "train_samples_per_second": 1.022,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 543415710547968.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0766b2d281496fcf3cff71bd3176c25625b3073e
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "up_proj",
+    "o_proj",
+    "gate_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4f2ad89cb9ad3b1ffdd0fd767694f3939c7da52b
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2aaac0136bc40e13ef336040fb4c6ac3a9c34aa7bdb2a455fa6188347d62fcd
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4c22fb562e3f3931dbbb04339c145ba33b13141f
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f417253e2c0e04301233f7852cd6cd14dd424032956d69b6dd2a3ff9757fbbc8
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab6f50a3a2408f37c1998134245e2f35c15a4625
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9163293866966448,
+      "learning_rate": 2e-05,
+      "loss": 1.4166,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9209812937726838,
+      "learning_rate": 4e-05,
+      "loss": 1.4921,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7513419496799603,
+      "learning_rate": 6e-05,
+      "loss": 1.3631,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.754331327849121,
+      "learning_rate": 8e-05,
+      "loss": 1.3313,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8161092277218196,
+      "learning_rate": 0.0001,
+      "loss": 1.1536,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.7970259355167723,
+      "learning_rate": 0.00012,
+      "loss": 1.0271,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8143357287470446,
+      "learning_rate": 0.00014,
+      "loss": 1.0623,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7430977217863324,
+      "learning_rate": 0.00016,
+      "loss": 0.9672,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5186908227716984,
+      "learning_rate": 0.00018,
+      "loss": 0.9226,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.42489469182136214,
+      "learning_rate": 0.0002,
+      "loss": 0.9687,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5091331543962424,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.9466,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.463799838358979,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9561,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.43868713681710897,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9048,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5255668414303949,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9016,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.4617659004309836,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9447,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4896592600229671,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9412,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4416036841873187,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8995,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.37320680671820966,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.896,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3781335739253041,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.8611,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.39217880082155243,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9179,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.3875066650438494,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8952,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.46226087229767915,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.8898,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.38249225776490886,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8179,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.42886624175871285,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8406,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4533228894108049,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.9388,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.33852756379078713,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8298,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.3778536242602346,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8718,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.35953660416723576,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8864,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.3143937264114336,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8184,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3788967311933849,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8882,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.36213335832310506,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8855,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.37933788919137773,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8466,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.3546289025572818,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8183,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.37374088665594296,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8501,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4253406504416569,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.9169,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.40895647289121645,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.879,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.3165715042361572,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8024,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.32960741200530574,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8001,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.35716428328685657,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.7774,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3485123728123483,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.7808,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.34264025569520523,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.7901,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.3338318387200505,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8176,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.30934157025197984,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.7435,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.36528602365420015,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8768,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.33795181946399744,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.7496,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.31841058178664267,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8061,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3588333916772071,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.7551,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.32979363052773386,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.7937,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.3663299223535037,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8081,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.365420266603761,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8451,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.36331027934440596,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8703,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.33315022571857567,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.7579,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3701346588086481,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8121,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.33621803428395475,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8605,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3270542438452731,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7837,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3276369008956474,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7942,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.34788580077673004,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.8199,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3422893547509533,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7744,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.38231898220230265,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7988,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.32894744574213003,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8069,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3238013073265371,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.7645,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3642101864143703,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8245,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.34996224525739966,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8031,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3792361629658678,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8767,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.3380215318560921,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.825,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3240107838253595,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7525,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.3641652055005377,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8785,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.32116798048624046,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7986,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.350351230087757,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8094,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3343616667111424,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.8007,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.34400861184293763,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.8058,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3217629099202676,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8036,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3448799506680468,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8184,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.33561964969079927,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7817,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3392407704651979,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.7715,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.29657870397100855,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.709,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.33813418600024964,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7871,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3416826898530567,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7969,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.2981910538689299,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.7411,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4029774190488523,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7657,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3688934607028786,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.8689,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.33197282771751185,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8135,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3328457603624298,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.7945,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.3597081373185118,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8227,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3624062974101868,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8193,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.319880066064466,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7496,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.36856807057057184,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.7915,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.38038518435759694,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7681,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.34489430198243376,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.7606,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3260103390407813,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8265,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.3654157265599615,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7735,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.33510436709742036,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7758,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3373925001019564,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7878,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.30010836206908226,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7726,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.38889888718744725,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7793,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3711177679722351,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.779,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3425210907474298,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7696,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.33997751764005274,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7488,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.32141948623233824,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7392,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.33949663819952836,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.728,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.3262082210447533,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7563,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3034623801146321,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7026,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3271828380287641,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7985,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.29986047797674475,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7154,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.35317859176949684,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7909,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.32786176013369694,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7529,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.30299131489469927,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.76,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3535296640756614,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.774,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.40807068025912596,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.7698,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.37071784614450315,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.7906,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3275874632790161,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7984,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.2914969550625818,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7286,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.32725028471181783,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.769,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3727566498615632,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.7692,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3370225841889968,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.7556,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2842272226710898,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7451,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.36320902592755067,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7637,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.30249909231708233,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.733,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.31992510603267815,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.6979,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.35107423715677044,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7993,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.33625336785734383,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.7927,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3602866370778261,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7855,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3074605944536328,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7248,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3057958902745291,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7032,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.38999508557023527,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.8462,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3817235639029034,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.801,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.33494104617840476,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8228,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3248689476374051,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.767,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3317346185237444,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7687,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3126217755965711,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7345,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3342062499387303,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7589,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3295916994383612,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.748,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3130205270632654,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7199,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3398177033860641,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.731,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3155687808886285,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7867,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.34238595877766037,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7422,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.30576992227691874,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7544,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3076010730264354,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7642,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.38408408830985924,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7807,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3049755918909417,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.6989,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.292438543795018,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7232,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.33263754585380034,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7598,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.28143421334423013,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7277,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.30315453345237825,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7473,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.30219790685112,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7636,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.322073440323831,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7898,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.37521946130708883,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7455,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3256047252143069,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7092,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4779417757830283,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7832,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3596860434953128,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7472,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.32962255387490563,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.8,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3611211399160521,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7491,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3192671808382749,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.695,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.31949935931855866,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7193,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.31110700730922364,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7104,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.41730556147696163,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7666,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3357401697744798,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.764,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.2924060557957266,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.6929,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3193691092922649,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.7175,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.30621879833675425,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7382,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.32699795272360216,
+      "learning_rate": 0.0001,
+      "loss": 0.7488,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2987335478288324,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7109,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3392541280977018,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.813,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.36226136393700703,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.6547,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3002445644070877,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7063,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3299914054053417,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7538,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3096490280262253,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7447,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3410705854093673,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7041,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3507201924983843,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7612,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.31780260642477853,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7358,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.29922638948580016,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7304,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.39868752013565906,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7745,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.34171184725576925,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7764,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.2987398982694719,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7352,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3286939484464021,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7682,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3422171703812053,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7081,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.5352184148808062,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7529,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.36553182360962533,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7203,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3725899918477243,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.739,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.36281213507325893,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7357,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.2972337348051776,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.7466,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.30521793330329594,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7264,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3408082485075347,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7924,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.27453729413856376,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.6938,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.322681402186739,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7584,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3414611441702101,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8018,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.31233590764343705,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7349,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.37213900764320684,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7521,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.2590892828912147,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.6945,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.2781516718298695,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.6655,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.34817878862191254,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.786,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3138559242318282,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.6972,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3271741493104444,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7641,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.28973751174107104,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7129,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.33571550673664213,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.8082,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.26899932541943944,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7517,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3329509997257,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.8128,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3609394516840636,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.8096,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.29276928147697384,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7309,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.28767354411045265,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7004,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3088729628061986,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7131,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3357986952575956,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7105,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.31206066419022793,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7159,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3868313645901953,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7563,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.29313931261667653,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.7047,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.34241164911628297,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7462,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.35188915481458427,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.7409,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.2932460158152136,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7333,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.29631995985705306,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7014,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.31510080269240404,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7171,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3295408127883741,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.706,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.31233092467065754,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7453,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.35669248566869416,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7156,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3155211412518988,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7567,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3673810003938718,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7397,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3384960762159778,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7745,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3109655404121361,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6912,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3222530441680068,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7148,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.40222613798662027,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.758,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.31264219212955136,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7595,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.37754809999159067,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7217,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.32156745646521057,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.707,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3142160541974962,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.7026,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3295178426188756,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7076,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.30884944419028393,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7142,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.2679648303037726,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.6856,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.2966145053474197,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7432,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3123776672696557,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7356,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.31681605079604264,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.6957,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.30323256782096997,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.731,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.30683641302921955,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7196,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3099465622888912,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7059,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3394631523163047,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.7381,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.2956908287206056,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.6998,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.39008891875701335,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7491,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3274417615767872,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7393,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.32468926558137884,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.6482,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.33865733040761303,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7172,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3239366885437433,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.6902,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.30733315996560057,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.6929,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3769329156429099,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7152,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3273555773412006,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7099,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.2931808275573847,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.692,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3752786184487787,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.6858,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.32213365090116863,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7617,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3326490848140708,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7433,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.30800910739286863,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.6919,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.49114707854361583,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7006,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3082444913571276,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.6905,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3185600456869451,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7205,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.36133186034657844,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7925,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.34594125770304723,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7896,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.338131887400637,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7372,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.32059408039911114,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6755,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.2903927824066888,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.7277,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.30891151567079217,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6868,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.30717699046560654,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6682,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.30079449574608474,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.6718,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3709182222658317,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7455,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3480906038452339,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7436,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.32079050007375537,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7135,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3820019792030307,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7058,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.46781284966016706,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.6724,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.322646476532804,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7516,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.32793966242976536,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6871,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.32915100964470706,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7256,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3078508862353911,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7451,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3153781972650277,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.716,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3529676795978728,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.73,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.28845262326412136,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.6776,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3480016710788754,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7023,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.2996303480950192,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.6844,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3183815751311741,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7043,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3016624458410541,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7089,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3167064263070341,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.709,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.2915655775574416,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.6358,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.30560329801302655,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.6783,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3006983164274014,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7104,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.32381519416112386,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7695,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.28976807968257356,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6894,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.30162867207666694,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.6573,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.31635142898683855,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7277,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.36265604881660407,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.6736,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.32074207308074953,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7407,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.2883460722478291,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7147,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3307282963021205,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.685,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.30251432198462375,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.6537,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.302807765576497,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.6969,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.32237386815312,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.7084,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.37620709533236935,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7136,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3688407096189695,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7006,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.33157890719191463,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.6921,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.31827075119431947,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.6691,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.2970866545880075,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.6984,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3010996146445767,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7151,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3619267851120905,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7775,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3506925860508666,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.6799,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3098875126472622,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7015,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.32082064744176364,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.6796,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.31943579966438246,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7002,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.33985923602030726,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.6423,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.2892635952892666,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7323,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.37923359172715465,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7404,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3103628292548366,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6936,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.34823523050987065,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7705,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3020439440237327,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7011,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.5310387928312621,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7165,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.28494078878406287,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6638,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.37140395118247604,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.7276,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.29302849163530503,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.6829,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.2949229958469089,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.6725,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.37943794632171773,
+      "learning_rate": 0.0,
+      "loss": 0.7934,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 786873517342720.0,
+      "train_loss": 0.7703683385864283,
+      "train_runtime": 9556.9637,
+      "train_samples_per_second": 1.046,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 786873517342720.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dd2e3c9bac94eb7f24dc48c7b39242778281e781
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "o_proj",
+    "gate_proj",
+    "down_proj",
+    "v_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..26ac00add58f331d7a6a8c9898692e65e2143de2
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:325eff7774f190b073e51cfe5f3e4b738c697abd6552ab41d94f592754e122f8
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5a466207d4f2ebc68fcc00da3e4a30375d36bb48
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:288f02279fef438d6c35b4405dc74883532028e256f714839af57fe7276025e6
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c9f9d20b13387f57c1b60120e64f43cea86c9cf
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.0085854697429473,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4793,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9972559026650626,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3539,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.892014568991878,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.4286,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9307992964341537,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.4656,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7602254990544522,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.208,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.9039945441243217,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.2587,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.9450987941401989,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1507,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.4738996367046087,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.134,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.8769935738644637,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.0135,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.770842931812109,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.0147,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6435483023409994,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9206,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5638381409433151,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9648,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.6361226951053639,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9395,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.6113438676456088,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9943,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5433136850170159,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9305,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6196928495947501,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8827,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5560559689651063,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8621,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5674823412664283,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9053,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.4694725740359349,
+      "learning_rate": 0.0002,
+      "loss": 0.8856,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.6006019984416087,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.9853,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5621161215924341,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9147,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5365689431483717,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.9093,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.5172863622066685,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.923,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.6347044714087057,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.9312,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5147510606941328,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8204,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.6953390865660324,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9291,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5346997972592501,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8496,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5887298482561468,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8733,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5744288001668765,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9139,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5817644265642208,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.9084,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5170973145542248,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8523,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5704175423395856,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.9821,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.48982925326254356,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8225,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5533715409598637,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.944,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.58521736370168,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8852,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4982855683284927,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8766,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.5098007683350397,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8812,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5125510085738779,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8148,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.5086907760916723,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.9317,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5425362095412308,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8609,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.6110549252989314,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.9693,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4506531338293441,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7965,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5972484794514582,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.9399,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6574173123923841,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8266,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5890215740181415,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8091,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4660388302518513,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8218,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.6497393240417777,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8786,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5292049724664672,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7716,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.6314025950524799,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.969,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5200569642060622,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.9039,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.5003737674200991,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8915,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.42454237909222065,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7601,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.44440771011539576,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8047,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5465428784613534,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.9307,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4840542158590719,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.9252,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4611462002254872,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8354,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.46015103495950843,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8425,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4315958592617036,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7914,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4981308035119613,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.901,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5382472930103323,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8608,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.535777393531857,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8906,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.46334889541129715,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8885,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.5049068018444929,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8412,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5453569712273227,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8551,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5033554954544484,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8339,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4870927782123071,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8106,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5240204968040884,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8529,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.48607963333726895,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.8412,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.4974957815786268,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8824,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5939769044560865,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.9555,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.47771639209882233,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8176,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.6386894662834409,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.9388,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.4576001640778111,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7665,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4200124513468208,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.834,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4970757026020292,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8543,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4640803261141436,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7556,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.46055917056692386,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8154,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.45748233373993097,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7386,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5061480509699664,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7209,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5125866939770451,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8597,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4834281117651111,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.765,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.48871433617747,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8293,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4533350258950691,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7953,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5020657989282823,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8578,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.49816197793406886,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7567,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.43322873088029873,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7421,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.49779652062498836,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8806,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5295204130991914,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8717,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4990838250346169,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7532,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.45254960997056176,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7629,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.45750879146448953,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7991,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.43373785740403087,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8199,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.5067483645268225,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7793,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5323777835671849,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7334,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5172495648535365,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7907,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.49612967926384,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8262,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.45671458373517876,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.794,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5037946215348362,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.83,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.5738153638450577,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.872,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4265537385053224,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8331,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4439066886609966,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8204,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5125766235279255,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.9437,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.4027517341407651,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7318,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4784749643379989,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7918,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5608708183821162,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.869,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.5216604137548994,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7746,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4301735202968478,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8609,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5054438035773254,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8819,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4870990362057011,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7721,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.42631773989843086,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8154,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.468453256337647,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8279,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4050949188886547,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7765,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5029661100503304,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.848,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4620683173397088,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8113,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.4465748334104479,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7604,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4827698092867937,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8053,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.49998806429255743,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7869,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5224572558802736,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8282,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.44637335744386086,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8276,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.45395325027658845,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.801,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4668880071503474,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7133,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4582223709509232,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8445,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5683295114262743,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8978,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.41420851133767184,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7754,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.48421886671784903,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7609,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5469804299166952,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8702,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5433538041032796,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8688,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5529201646076498,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.9029,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4503807504561782,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8253,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5328965011735313,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8496,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.4954691357865197,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7994,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.42442377421361566,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7241,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.46628402843496697,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.9319,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.5646971847258934,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.8479,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.43213337310707894,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7964,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.48895432647000003,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.8323,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.46953643840086196,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8575,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.46635817800169,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7786,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4863706332365572,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7945,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.49557735178852874,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8256,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.5610746106786217,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8474,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.45160520338706195,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7835,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4365881164231204,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7897,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5502156329221844,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.843,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.46958433144630546,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.8087,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.47221562406214135,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.8508,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5237199615035483,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4376415127234786,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7777,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4169761952295241,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7176,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.54410249493475,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8416,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.47043227050187336,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7098,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.40960863633131694,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7343,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.5067507056604774,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8486,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4245163031138323,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7495,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4556737057249965,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8479,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4493326167767346,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7581,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.3716425384364705,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7431,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.444819110511713,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7672,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.6455832769852934,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.8011,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.47385823913702435,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7675,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5493023801642081,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.9084,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4614982241890986,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8556,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5027741137636845,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.9164,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4127974732677408,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.746,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4174843002501883,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7914,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5144471825402751,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.8316,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4239583452770837,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7628,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5609278354318298,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.9009,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.5569568486822811,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8878,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4666621535700966,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7921,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4428243771161841,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7353,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4714927066425922,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7925,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.5064662007808188,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8268,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5196635747410672,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.785,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4933582591088359,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7448,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5666841217272961,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.8215,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.44991866863529384,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.74,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5370065695691301,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8109,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.48386915891784066,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8156,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.47624527776429076,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.863,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.46309055722372316,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7133,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5546169176698907,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8498,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.5552794343943931,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.8429,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.42596936374424016,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7344,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4780424925511933,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7698,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4642570392124382,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.8356,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.44471874740393336,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8078,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.41678047427736536,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7649,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5060691569695956,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7913,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5092570360201568,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7979,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.5519518990796427,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.8106,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4608973951013133,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7798,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.438865871044298,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7487,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.44750886312268195,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.8061,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.45558219396674005,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7705,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.46146903515791504,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7486,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.42258959370942567,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6807,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4420660188622748,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.8124,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.44872318502255715,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7467,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.48951210749650576,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7335,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.5051382436063975,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.8258,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4126253533370679,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7109,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.39969454905380336,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7345,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4369673430067481,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.6952,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5253068641077018,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.8515,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3759400648888865,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7759,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.38825750674357584,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.697,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.43550078166914036,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7519,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.4664952435365108,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7405,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5219653886563514,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.864,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5150644370748351,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.8156,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.38827105265454487,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7163,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4520282940991433,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7775,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4019485406898617,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7576,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4715011902057589,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.722,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5281060367751487,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8446,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.5490901682193271,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7485,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.5550854172212718,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8158,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.5539551532383808,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8125,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.45711728899539267,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.8025,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.4557303163398866,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.823,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.42817988049854483,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.8055,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.405898803334239,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7766,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4070049345009079,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7065,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4775835805503555,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8062,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.42928661633445536,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7561,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.5839596443542385,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.806,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.5608646212586603,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7526,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.5443624713634069,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8007,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4708628421536427,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7315,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3602042451103198,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7519,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4600262840841861,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7585,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.45411860254567327,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7386,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5952105328243131,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7995,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4315997635636127,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7063,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4457882641252247,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7835,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.4051747892824553,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7272,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.43009388687261124,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6833,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.43541512528849646,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.786,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5336227600402277,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8376,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.514637844230442,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.8645,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.460137726129797,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7485,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.6221242818247743,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7667,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4618588089300471,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.8215,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4285571304149657,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.751,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.40258047650756856,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.719,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.41924632079464363,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7204,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4180521137734907,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7032,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.5591578173202266,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.8644,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5129824870743946,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.8502,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.6307386849527056,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.8539,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.46736986952092735,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7708,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4799931475937041,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8015,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.43476429346809675,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.8577,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.45237012150052847,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7461,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.5050499956650181,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.8059,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.4862202647945296,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.8695,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.40591057767369276,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6893,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.4108968185982489,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7418,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5100121891818531,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7532,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.44386921266144647,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7848,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4528466297685063,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7398,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4491985544541127,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7668,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.46701793529554486,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.742,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.42062540861302994,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7292,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.42052169271938605,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.723,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.415839073871717,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7236,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4914666164380902,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7554,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.46799457192175714,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7626,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.410108044119609,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.8287,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4091253324705526,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6915,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.5009956691028776,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.8084,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.41570873163794475,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7516,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4489434399582018,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7754,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4131301748977756,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7577,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4472588663356602,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7858,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.5565824209551299,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7866,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.43623811369525867,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7859,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.6975936438544309,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7157,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.42423336466106243,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.6974,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.40055370404173907,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7262,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.39890372346188857,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7396,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.5096735125273331,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.778,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3795017666300686,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7481,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4108604523234828,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7503,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.36367980390858484,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7245,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.3977646017526582,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7559,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.39638159967335806,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7583,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.42130562584263453,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7599,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4077415100488722,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7852,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4456509250757706,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7719,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.46395650572041225,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.813,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.5392629108164115,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.8178,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4077572738017568,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6964,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.40929898598852493,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.721,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.46110445447232007,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7109,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.359324747412484,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7509,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.8336808335526302,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.8277,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4620609468430462,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7921,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.48121311623043744,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7093,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.4350895665343257,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7967,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.5679574902032004,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.8072,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.46950631028416706,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7734,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.570490021193672,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7301,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.40568313662242395,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6713,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.47606013944566766,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7327,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.44278344909344114,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7389,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4626487984327057,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7148,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.41413130581808577,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7274,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4690577728618916,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7047,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.37772131967552147,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7371,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5241995894421716,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.8145,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.45138163052349395,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7504,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.49809750128277064,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7833,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.3862594160358528,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6885,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3912616000490057,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7126,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.42671759932825865,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7653,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.43525088205981943,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6812,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.44353296841742135,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7444,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.40347951408980726,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7487,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.407563259800527,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6918,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4825945149443946,
+      "learning_rate": 0.0001,
+      "loss": 0.8286,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.46888527256064205,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7278,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.35415300739627886,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7081,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4174570287002455,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7862,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.5042407232677681,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.8516,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.4019155298012977,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6205,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.47936001841636144,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6954,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.45559451960122294,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7448,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3725050996355319,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6761,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.43152640749522186,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.8228,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5019787195538405,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6953,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3933828775717379,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6986,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.46109184390562064,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7974,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3751402629468758,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.702,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.41075744550477633,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.715,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.5443171673871285,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7848,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4429556434128808,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7455,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.4083669981733497,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6803,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4715431488867771,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.8047,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.38448011267664034,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.713,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.42425944294235907,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.748,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.4379812604643885,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7697,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.6209340714517821,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7874,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.44201198942465997,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7699,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4764254770834733,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7901,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.4096441586651275,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7228,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3969513853243152,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7529,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.5365493484110804,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.8709,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.43614872993323006,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.676,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.5092788403946433,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6988,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.410847615267607,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7271,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.47391115175094317,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.7974,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.5095539719795574,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7132,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.4571194378914754,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6577,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.47737735497939765,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.784,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.38353722509787763,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6723,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5695389740488869,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.8151,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.5391637529898152,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7274,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4006072175621919,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7434,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.4222671576070959,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.7428,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3976579330761039,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7616,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.42255730442297545,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7609,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.41246860766662846,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.7043,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.4910922066018139,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.8279,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4583975979375754,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.768,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.39329145110417946,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6895,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3889200786137619,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7061,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.4678598370358913,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7445,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5114556160689178,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7812,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.46430482828067804,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.8086,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4652368825183028,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7996,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.4156483527126143,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7143,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.42072838682916724,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.757,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4610948866880415,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7254,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5409017437356095,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.78,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.32624499419219716,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.62,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.38165761820976635,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7645,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3828166260395758,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6832,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4046787667999106,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6513,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.3643996551478938,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6731,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5878438451616897,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.9034,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.43507716349366665,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7459,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4675199604593121,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6524,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5220350889448834,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.8117,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.38381197210212664,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7285,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.39760158207342916,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7173,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3884490761221106,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7115,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.43957523114488356,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.798,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4333978171844909,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.8164,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.3732803287774478,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7454,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3794733664568056,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7613,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.41549742261080647,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7869,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4855203473158915,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.8364,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.5197083474910514,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.8886,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.42369930329880323,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7242,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.4699463590136928,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7397,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.431419333342201,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7159,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.387076267269984,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6996,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.39386317351097094,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6947,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.4080080280633503,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7187,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4418437822394713,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7051,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.39331968967272185,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6688,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4753675991971554,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.747,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4301540427697761,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6812,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.44731642973716046,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7549,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.43316276103068396,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7612,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5390689427095837,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7472,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.40013520823674165,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6761,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.39368096794889446,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7313,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.46588282500462086,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.785,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.456856055688501,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6943,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4852444095134256,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7224,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4396387069700316,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7541,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4586911399489298,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7346,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.38451486127934603,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7263,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3734322129644761,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7304,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4219042551455218,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6699,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.44381594455152107,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7627,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.41809763823810103,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6691,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4663430287654327,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7223,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.5170665655234487,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6877,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.5405806804681932,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7992,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.34421740208058965,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6877,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4252508028678407,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6885,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.6942440541587961,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7351,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.49142346057250874,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7834,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.40869861841731175,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7305,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.5489019189247966,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7375,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5050582962020529,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7412,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.4262794392530901,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7407,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5015819918976162,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.8035,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.43399711540507313,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.7248,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.4359167953464953,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6538,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.5158105983586037,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7668,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.393681478987024,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6613,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.6203780278226882,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.8716,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.44289616391666736,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6449,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.42697599683590876,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.8015,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4410676387461542,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7096,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3890591971991562,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7553,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3943773090859102,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6896,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3952617637753136,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.675,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4841894572166325,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7346,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4366709343004436,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7471,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.42191672845476863,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6509,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3775530356391045,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.733,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.39105134753052495,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6837,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.43353346912673657,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6601,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5872624794785681,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7603,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3759509267407406,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6827,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.38646793874726243,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6838,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.41969699299771746,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7628,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4213752239198554,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7109,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4646135767858213,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.652,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.45933925738321174,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.809,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.41950659530606604,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7318,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4390391420303596,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6524,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4205145624082347,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7154,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.43775520420161573,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7427,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.443039095805843,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7381,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.40446606491238196,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7041,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.4565815989336157,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7207,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.40131850955302295,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6876,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.44971058723052154,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7439,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.46751286290328736,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.7202,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.38823586195199966,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6626,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.4451744270156566,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7288,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.6022477158409618,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.796,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.539410763170956,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.69,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.45742363917437995,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.744,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.47344603336882474,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.728,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4135333089243574,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6671,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.43002045441598563,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6231,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.42317314012568297,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7195,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.46626381925481214,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6961,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4136562321861586,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6987,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4784975974141901,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6789,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.40533604299838283,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7154,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4206369423773519,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6601,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.42757838826384376,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6631,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.5391649221416999,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7564,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.3904100841608839,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6596,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.42062545130796897,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7431,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.4857018013340481,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7497,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.36259452986829344,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6305,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.37515589328507154,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.7014,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.42845416708723516,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6606,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4553734430542505,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.763,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.40015155573782624,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7534,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.5208900301952243,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7815,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3503748165237743,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6909,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.3908558731476936,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6813,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4687428745927545,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6946,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4332820639854068,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.7331,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.42723456851448505,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6618,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.46402871060489587,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6508,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.40578292960530027,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7229,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.43520879496687526,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.7499,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4536989641209546,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6821,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.542222972927613,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.8348,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.39390918306234635,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7415,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4533121637690508,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7509,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4687709391858201,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.8083,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.49228224147446814,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7353,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4758894189648476,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7248,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.3559686727798047,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6908,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4770784964280279,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6538,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.7078603935083368,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6817,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.39213160731883573,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7589,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.47020008634550475,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7336,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3792146423670458,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6326,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4262609061539926,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6411,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.42577224096572075,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6806,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.42374880063325143,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.681,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3952042103509857,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6542,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.3616527944352411,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6835,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5241196153307605,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7916,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.5390849479993916,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7987,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.48039231414527545,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6692,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.42707179923530125,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6568,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.46109134311696376,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7601,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.47142533598707004,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6752,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.4504467554154392,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7277,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3831266438749168,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6506,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.40016219076978526,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.682,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.44514968765868584,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7746,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3843751561475455,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.7161,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.4669995535333905,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6942,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4529295547385918,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6589,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.482964134784402,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.762,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.41287764501078317,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6815,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.4098495155093636,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.741,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.41847355992725155,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7382,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4653257835694142,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.7441,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.43252875114325257,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6772,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.40733522956105084,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7139,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4956527752274115,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7383,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.39614957218583297,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6676,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3753280480518952,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6703,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.475121842569078,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7619,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3521757249003535,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6272,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3764674116747337,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6801,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.5801586357247913,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6769,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.44486716819587074,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.713,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.44155527404950856,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6813,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.44335611847768286,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.702,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.46990914035756637,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6971,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.4321875894671604,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.7515,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4252627880976508,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6518,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.39611014885838497,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6757,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4095060338394822,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.5828,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.41851390395160787,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.669,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.37014421972947137,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6667,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.39958569740213346,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.7322,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4151542139680637,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6759,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.4469625332463983,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.8135,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4066896591691095,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.7106,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.4163489054937086,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.7317,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.36587621421685,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6346,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.37495976884733584,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6356,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3925825846368969,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6646,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.49011988410515533,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7772,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.38832737961220826,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6655,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3968057567239704,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6856,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.48571992999735075,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6528,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.42691389717667355,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6593,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.45668408205123967,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.8113,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.40497751915448754,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7361,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.39305353036446256,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6766,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.464968814745453,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6357,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4401056844056556,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.7237,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.43214470617398676,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.642,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4230345584965771,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.654,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4304618514743957,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6704,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4125636074837623,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7088,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.5083751865235997,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6955,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.4480447682345799,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7025,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.4669983763007555,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.686,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.42135345865339036,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.724,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4415599946558524,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6629,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.5102318567062917,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.726,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.4339243196871157,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6724,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4587114055215017,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6998,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4279050205280435,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6884,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.44477451329874085,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6413,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.5444579696189681,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6767,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.42887921931200046,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.7089,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.4004212833108163,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6784,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4173772009199892,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.7361,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.5019148581562954,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.7951,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.5466510572971621,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.7459,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.5495540131846346,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7266,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.402119858865409,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6183,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4746782428308101,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.7274,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.36514268392405524,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6576,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.47592924104667145,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.689,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.39293283630147624,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6597,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.45086953532530955,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.7274,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.44438641462434025,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6614,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.4162291164037949,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6342,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5466716041149762,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6394,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.42271815559211845,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7811,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4244644129053721,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6654,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.42256064647922514,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6945,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.6239794621223662,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7737,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4172502376073947,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6797,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.39826667579957786,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6891,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.4601710262147938,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.7329,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5297655920286328,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.7912,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3757166865051945,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6968,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3936377352463753,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6908,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.43741423927983253,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6844,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.47345694727007964,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.7313,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.32409034288972116,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6177,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.43997495517123886,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.7024,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.5443286523974251,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6954,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4949478207365622,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.7485,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.40186144816647534,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.7105,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.37852597841630553,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6434,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.44720806471916646,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6678,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.400060471651697,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6656,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4990555952833043,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7616,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.5903516833425473,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.8112,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.43333961125183756,
+      "learning_rate": 0.0,
+      "loss": 0.7567,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 543415710547968.0,
+      "train_loss": 0.769115243434906,
+      "train_runtime": 9658.4573,
+      "train_samples_per_second": 1.035,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 543415710547968.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf42828caa84117ccf1e34e7e03ab421595fd65b
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bd16e72d0ffe5a221179f5c9ec253efeacb775f7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1b105009b66ec919db82ffe954b793fe0f891faa335074e2968b8ffe2b62b8e
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c372d246968cddbf373a89266de2c1160f262c57
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fb9795a816cf36277735654517284cac53c8a54b525688e6f5e38c12eb631a9
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..839168a196563227f5e7c111544d612590d4674f
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9103283675313728,
+      "learning_rate": 2e-05,
+      "loss": 1.4166,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9144320274792768,
+      "learning_rate": 4e-05,
+      "loss": 1.4921,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7274963361526622,
+      "learning_rate": 6e-05,
+      "loss": 1.3633,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7533872931679354,
+      "learning_rate": 8e-05,
+      "loss": 1.3309,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8072799234288184,
+      "learning_rate": 0.0001,
+      "loss": 1.1534,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.8128649163781364,
+      "learning_rate": 0.00012,
+      "loss": 1.0277,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8094747897805041,
+      "learning_rate": 0.00014,
+      "loss": 1.0626,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6501117061319456,
+      "learning_rate": 0.00016,
+      "loss": 0.9675,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4844842936596786,
+      "learning_rate": 0.00018,
+      "loss": 0.9223,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.42169366246618345,
+      "learning_rate": 0.0002,
+      "loss": 0.9691,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.40603834149407325,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.9463,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.45702055345432224,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9557,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.42227819444173914,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9032,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5015499256486801,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9017,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.4590177978923712,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9441,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4696048910755479,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9416,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4382509564334526,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8984,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.38129492603906207,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8963,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.38394803376304665,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.8606,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4387045971731528,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9166,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.38444814054921106,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8944,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.45143495292294156,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.8889,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.44336900265005136,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8192,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.41164037436606715,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8402,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4334476983961227,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.9395,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3442486491029148,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.829,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.37737297293447203,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8714,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.36390961631619445,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8872,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.3354385508656872,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8179,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.38209135195320887,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8878,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3637649637601686,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8873,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3752837293323211,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8485,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.36023976019261966,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8172,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.39540517944934683,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8485,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.41411169758141353,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.9169,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.43753763956502895,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8761,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.3122556706775927,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8011,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.3340338425438589,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.7981,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.33378726558197863,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.777,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3620481789765836,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.7779,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.38288334079362785,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.7896,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.33613117188038577,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8166,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.33308279105593896,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.7427,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.36908712891619183,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8754,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.37395396744736786,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.749,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3160307397453434,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8054,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.36749447963029164,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.7574,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.33549636928097215,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.7923,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.37868595879272665,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8073,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.38925373377460204,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8455,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.3672188908686244,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8705,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.33169876099654205,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.7603,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3795202679699071,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8117,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3449122176312621,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8594,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.33161046480987155,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7841,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.32548937249448306,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7966,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.360653048373736,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.82,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.351951999314567,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7742,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3701117149066087,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7952,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.32825144421037894,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8084,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3321888433864363,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.7646,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3743402139687879,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8244,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3514231301804191,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8038,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3933077216029149,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.879,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.3549182509819441,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8278,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.34188248262920085,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7532,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.3729093320580842,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.878,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.33083049897801425,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7991,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.3546403903312824,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8093,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.33586476961490047,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7997,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3488750651611648,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.8055,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3269454217721887,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8059,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3526119888506058,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8178,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.34485108812383425,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7827,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3403414943684799,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.7709,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.30538519888266924,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.7102,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3423028009161411,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7889,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3382000191220972,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7956,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.3046490107370409,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.7416,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4203866308689628,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7684,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4060027558515808,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.8698,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.33995904638689217,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8143,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3588441569836783,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.7956,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.3674202406542926,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8214,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.37482770602753407,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8209,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3464001739697096,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7486,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.37285581487412267,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.791,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3979485453242663,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7703,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3507154145343656,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.7614,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.34617791618977534,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8254,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.37892909020804283,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7727,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3361348593587784,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7759,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3381620224272412,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7888,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3062100919489959,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7714,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3849578010135168,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7798,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3650869029786026,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7793,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4220453796587287,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7737,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3445256004727701,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7471,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3195235032079133,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7382,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3413154323610123,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7305,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.32466681948160775,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.756,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.30531327493430477,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7009,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.35300100306276294,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7995,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.2865252501809062,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7136,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.38024243713722367,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7912,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.33295045305208426,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.755,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3071984559961337,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7608,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.370771625197189,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7725,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3697591742580659,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.7693,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.36908515335555,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.7923,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3232317554930679,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7994,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.2864545104930335,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7266,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3281035382183986,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7697,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.432073780382289,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.7696,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3489210583638896,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.7551,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.29005250465130256,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7455,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.3687315971108826,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7632,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3196634468449587,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7339,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3021021607384978,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.6969,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.35914168156137066,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8014,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.33533749250540656,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.7945,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.35729590877733236,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7868,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.30367986407370473,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7287,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.29868048103393635,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7024,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4254752746800852,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.8463,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3701875254407606,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.8023,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.35082155416776684,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8221,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3240073690528122,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7686,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.34042694987957256,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7697,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3165193575330039,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7345,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3164043962116355,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7569,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.33222007537208365,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7474,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3379494196541218,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7189,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.36085614892637663,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7306,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.31824319737856027,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7862,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3355156358831619,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7457,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3112035899857585,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7573,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.30736498729966194,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7667,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.35606634972260365,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7795,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3059543952599057,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.6992,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.28162119452183293,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7221,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.335028879547796,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7598,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.2967556622516192,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7301,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.28900459549646895,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7467,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.29948470291870005,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7643,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3289223426110118,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7917,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.36344371877255777,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7449,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.32629958114495206,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7101,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4729809069084755,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7837,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3557131350847042,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7462,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3310833706511039,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.8,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3557288386127655,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7474,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.30640476176594855,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.694,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.35396779632156705,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7205,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.31843226775773836,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7112,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.36432332157220065,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7683,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.34809268929906395,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7658,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3264788391551195,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.6921,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3400045059101885,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.719,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3248741495647313,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7398,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.31271136364143504,
+      "learning_rate": 0.0001,
+      "loss": 0.7488,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3333951914768973,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7105,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3439039077825358,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.8108,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3371739068964004,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.6551,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.2919025988586933,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7045,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.36491481768587625,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7529,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.31445880002906634,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7426,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.377819675471851,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7031,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3508448667090174,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7604,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3163495470425652,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7372,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.310680946219706,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.73,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3767362502428194,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7748,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.3333698966486753,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7763,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.2953895967701411,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7358,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3263010222901318,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7688,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3339166843318738,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.709,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.39246895356613587,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7528,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.33949846938337874,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7201,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3488329704671586,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7388,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.35937058036793534,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7339,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.30344315288005674,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.7444,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.30055346873379907,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7264,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.5512874605658908,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7922,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.28422384034250225,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.6951,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3264059434946543,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7573,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.34136021341909206,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8017,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.31936628762037644,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7352,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3705583084948377,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.753,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.2641217408168145,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.6945,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.28199637025305835,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.6657,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3455130815729331,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.786,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.33204650997498225,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.6985,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3363804677488833,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7668,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.28777279245420617,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7124,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.32216300619118104,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.8082,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.2751244879997451,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7523,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.33953479421377675,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.8128,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3698224464316948,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.8092,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.2933579428054677,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7312,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3017197415602167,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7024,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.31184940241088266,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7134,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.31997021975402207,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7077,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.32029492277494004,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7162,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.34003208753770975,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7546,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.29865775478304046,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.7071,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.35713876514251774,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7462,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.354479165253479,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.742,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3072783382105601,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7326,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3066891047944988,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.702,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.32137740444038204,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7166,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3226385011981905,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7072,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.30654979020218126,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7463,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.364514113762281,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7154,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.31061468570850853,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7561,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3820596683556745,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.742,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3690660332818625,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7749,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3103712139550411,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6921,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.31693912519742806,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7153,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4035650161680625,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7578,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3155224002650208,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.758,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.29153296754586383,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7223,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.36059846357452535,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.707,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.30478604752052557,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.7,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.2765974186524337,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7077,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.30696336878631286,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7149,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.2693060484483783,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.6853,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.2996865559401467,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.744,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.32116367983000393,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.736,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.30953157227898026,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.6969,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.32878634968048037,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7321,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3058791861937667,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7205,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.31626313050720906,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7064,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.33087842953319835,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.7387,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3103197856204972,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7006,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3892646864117474,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7486,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3265850973989887,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7401,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.30929124848146605,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.6481,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.36052434727246496,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7153,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.32077054306044706,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.69,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.2942321680000018,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.6921,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.366812231700345,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7179,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.2976605054353928,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7097,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.2806071151500792,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.6906,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.29862145430358444,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.6859,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.31848989816101214,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7626,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3175131736732565,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7436,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.28853504741657543,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.6933,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.32076762021357175,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7019,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.34196822163816115,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.6907,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3134739522586932,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7212,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3518835608545312,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7932,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.33140803770709104,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7868,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3565207578988336,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7389,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.30549965924216654,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6764,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.29213626540883963,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.7273,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.2980038332160969,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6846,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.300210767086982,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.667,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.29777208684669104,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.6726,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.32989535704655576,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7441,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3527085657083696,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7425,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3413606436658763,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7133,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.34174194853533874,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.707,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.30750666981171837,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.6718,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.34851966567817805,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7525,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3430438314400539,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6865,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.34507491001563145,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7272,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3067889589850218,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7442,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.32827798904559324,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7176,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.40722093804907067,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7319,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.2876035229559397,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.6754,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.30978808958198306,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7003,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.29665723847819137,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.6846,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3242757800911127,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7033,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3077986435730653,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.708,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.35347342436745144,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7094,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3074112596076014,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.6347,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.2885076062887646,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.6771,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.32272553689543476,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7111,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.310886031194646,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7692,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.292492219987561,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6892,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.2868717907909984,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.6572,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.30933088783540064,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7281,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3204501856072015,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.672,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3173471684890607,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.742,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.2820215009364713,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7136,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.33732261505341615,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.6868,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.30613483894533183,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.6549,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.33200651204689885,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.6965,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.317578222945079,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.707,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3353103698687747,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7123,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.33880236092959853,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.6995,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.31986137813310067,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.6933,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.33061247165083923,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.6693,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.29596791807317174,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.6963,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.30279235472694177,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7168,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3508660189931767,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7781,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.34720284637704274,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.6795,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3069150363317921,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7026,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.33833933003093686,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.6799,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3123949902112611,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.6992,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3649115379085674,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.6437,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3187056613092533,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7325,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.37774427223289525,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7433,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.30360889754672465,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6937,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.36622826874025366,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7688,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.29793100657713634,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7004,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.32667040098315064,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7172,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.2774095648074841,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6632,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.36101836423025013,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.7271,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.28363398532541456,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.6835,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3101541265795471,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.673,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.4525513676593632,
+      "learning_rate": 0.0,
+      "loss": 0.7941,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 786873517342720.0,
+      "train_loss": 0.7704597816635401,
+      "train_runtime": 9554.0421,
+      "train_samples_per_second": 1.047,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 786873517342720.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..23a3e536e974f1eaaf496d7257728d54b167649d
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "k_proj",
+    "gate_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2c6ac7dd93432a68c489b02d96b5a1445c5a6d31
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:615e942fb4821335cea29abdf1b9f617266efbbef026514a2aecd3f9106caa9e
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5d318c828d80516011d93fee452cb99f234e50c1
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ace8b7e231a47f824654d09f66cc1bb791af3142f0e069bd42bb59062967cb5d
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f30f002e9c734dcc89180ac3c07469166621cc10
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.0015174620485605,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4793,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9876415696365153,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3539,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8871311223104527,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.4285,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8886476879769438,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.4661,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7508929168773036,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.2084,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8976026372242618,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.2591,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.9424845533668358,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.1513,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.455853155223566,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.1339,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.8693405772483687,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 1.0139,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7654137947938375,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.0142,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6451888198745874,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.921,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5674718053393812,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9655,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.6338107894244854,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9404,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5970437433211514,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9946,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5386631750608524,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9299,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6098521133390008,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8836,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5526053541676853,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8614,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5621891595051264,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9052,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.4766365170854893,
+      "learning_rate": 0.0002,
+      "loss": 0.8847,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5840529147180359,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.9846,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5547468909516751,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9126,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5236711341322138,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.9088,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.5134909604364669,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.922,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.9697300427476122,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.9295,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5444734161267164,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8188,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5285720378875282,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9296,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5472043933513683,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8499,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5731832933338967,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8745,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5860201677194361,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9126,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5533538251539861,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.9087,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5203941717384726,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8501,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5830666219014088,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.9825,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.6611403135288123,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8221,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5509131235427106,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.9468,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.48262614802977466,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.885,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4967468079606848,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8773,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.5054682556817377,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8839,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5783100473180441,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8111,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.5091982204899665,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.9338,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.48714751969431214,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8598,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5905380509640572,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.9708,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4517410539140721,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7965,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5897811049658183,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.9407,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6449325148181358,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8267,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.590279216662507,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8075,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4803036071450405,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8232,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.650057906490265,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8773,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5248713216167825,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7718,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.640458383752968,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.9686,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5541359099271482,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.9041,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.5028891773577449,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8902,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4272704300386147,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7603,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.47583155154421025,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8069,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5709391164161426,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.9319,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4817867624586776,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.9246,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.468122073430705,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.835,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.48146205808220116,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8406,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4293599866988802,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7913,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.513833564071936,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8971,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5306753517830095,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8591,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.5352719795282737,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8931,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.46783532766005226,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8904,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.4614025887933628,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8454,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5292160356256126,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8513,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4982992250084739,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8386,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.48428581778363466,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8086,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5073401208837878,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8522,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4801419636570018,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.8365,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5221625549443353,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8845,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5986785798900521,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.955,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.48615409615097854,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8108,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.6451006868580349,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.9381,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.45118744899163543,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7676,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4340630088345874,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8363,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4992414504376428,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8589,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.45489056952933793,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7578,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.462574375461729,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8144,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.44677457011919625,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7371,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5300906125839625,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7211,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4956868841920524,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.857,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.478050340938763,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7653,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.506827815430912,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8293,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4453021459427925,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7946,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5009642065830214,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8618,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.43906905180332645,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7575,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.41950432980358227,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7369,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5016133929837371,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8835,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.516494245891528,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8725,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.5021999595004972,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7533,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4430566994249469,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7624,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4509312202200304,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.799,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.43880539468467566,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8197,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.5132542928814076,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7823,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5245855062966562,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7357,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4947356222564879,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7901,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5166934209160752,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8242,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.567129098577025,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7955,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.49886841034517554,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8322,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.5001197133372923,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8714,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.43228199460060496,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8307,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.448189800279342,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8203,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5166084063249642,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.9382,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.42395231697251196,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7309,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5325728930102084,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7919,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5810813596160656,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8665,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.5178420018432477,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7748,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.43174207765650063,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8587,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5084392269746427,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8812,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4938158911376374,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7739,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4209234076367087,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8116,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4902464756683709,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8293,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4029217813145927,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7758,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5027079156271373,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8489,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.46448340749276795,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8098,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.44950359161598746,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7572,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.490597035755986,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8055,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.5056506525939135,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7864,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.48824543659451236,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.826,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.438338905865828,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8237,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.45226308155612466,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8023,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4794240733462027,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7136,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.455112467805177,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8448,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.554454372998596,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8949,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.40719407499070376,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7724,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.49565188196879706,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7619,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5315727228630976,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8732,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5278973926598879,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8674,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5451926259797549,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.9044,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4471628586141413,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8245,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5099190212606985,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8499,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.4842632782791375,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7982,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.41845249804793744,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7219,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.4521500741887537,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.9326,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.549396544660062,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.8468,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4250544447837251,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7971,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.49064918738470137,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.832,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.463827496584346,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.854,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4597855238764686,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.78,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4914738603082715,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.795,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4926261389560048,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8305,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.5031856664703104,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8475,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.45706982680396824,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7866,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4301082418624238,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7872,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.44829906702649863,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8394,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.45893221626141645,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.8098,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.46227383900189783,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.8534,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.489231355056065,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8027,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4365612758393478,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7776,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.42303435003406964,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7209,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5323466461183587,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8451,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.44172675899358993,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7089,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3987636549147191,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7322,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.4987262959398162,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8479,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.40802324497108594,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7477,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.44260849482811404,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8477,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.45152048928671507,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7607,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.3727117850944028,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7461,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4346644018549827,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.768,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.6387377791538981,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7941,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.46738874826437676,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7664,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5469311399482338,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.9073,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.459477644555679,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8536,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5008793140691554,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.9077,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4014856961750562,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7414,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4156727906348157,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7879,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.524723299497917,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.8268,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.42793136149893307,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7652,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5827078181581881,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8992,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.5770024054009357,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8866,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.46303670018289184,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7922,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.43144009507026043,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7332,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4676567443973931,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7909,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4518971099838989,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8271,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.502779035397551,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7827,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4739995975160528,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7485,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5806028445201168,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.8176,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4440921268581451,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7396,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5464928579487258,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8075,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.518382011831501,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8169,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.48733722711706706,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8649,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.474150897982231,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7129,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5739178426992518,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8426,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.5405118513170662,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.8424,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4091725361226266,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7383,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4941518676866939,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7698,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.460410852033095,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.8312,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.4414176713439921,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8052,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4144602985262704,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7652,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5133854428463844,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7949,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5017733040045176,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8006,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.5070132146678253,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.8106,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.5024867833341402,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7785,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.4484189925977328,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7487,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.47324489960943145,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.8092,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.46644172129515227,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7696,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4445964562034957,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.749,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.44053163752870483,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6821,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.44006988291084115,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.8123,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.46162021021553556,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7466,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4861509641478491,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7303,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.8113002127550668,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.8212,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4412783697302041,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7124,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.4009577314704931,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7361,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4836043807102253,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.6946,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5510487709140544,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.8482,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3948986077066301,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7756,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4454833183647007,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.6966,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.44071084413902345,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7498,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.46124858780311434,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7398,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5221312959740511,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8639,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.527156104993205,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.8172,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.39619485323325615,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7167,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.44701220142882203,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7817,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4272688101955897,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.757,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.47077908804199275,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.723,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5195474745442558,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8476,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4407026058813082,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7508,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.5789322923328071,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8146,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.5501834951591267,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8141,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5671286578471947,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.8022,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.45955116950959923,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8235,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4183893271328529,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.804,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4018444295412617,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7728,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3855479422960728,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7062,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.44530270924587134,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8019,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4274637217340129,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7553,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.5473912939885323,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.8066,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.48002498604463145,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7501,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.49341673598237357,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.47316118480038,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7333,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.36381074635090893,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7522,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4578934599507819,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7607,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.4275844193572388,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7397,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.547941164326141,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8038,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4294673298613944,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7054,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4206285330078151,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7842,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.3973909966495677,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7248,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.445338802308087,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6854,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.43967054769693525,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7839,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5125341807543686,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8355,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.48474309555396194,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.8598,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4664265460912641,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7469,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.4662157993678711,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7687,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4957693519995061,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.8233,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.43547855134293173,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7524,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.39986545231882026,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7214,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.43736721625590835,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7195,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.41678788946271195,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7049,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.5097479119844933,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.8609,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5219257677032542,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.8485,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.5731982429069994,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.8518,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4670563574293175,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7679,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4618462877156586,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8021,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.43635999776213674,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.8576,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.42260696560466565,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7485,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4644266252543304,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.8067,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.48179710380261775,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.8687,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.41539921332562196,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6865,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.40719931133459253,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7393,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5014877269699531,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7463,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4364647270317339,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.785,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.44651786022049156,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.743,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4489730804175122,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7673,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.456792277856867,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7405,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4137202400333378,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7288,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.41595808890975944,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7253,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.4078746657180001,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7226,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.48736207830058065,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7571,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.4647000749119668,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7615,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4105994274631069,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.8328,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.40839766721449233,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6947,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4823030145018645,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.8035,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.3840383966747989,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7476,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.45375744146113045,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7732,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4322322503914143,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7601,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.43814929438507033,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.781,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.5190464537109442,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7936,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.44274623078075137,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7882,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4488098560364818,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7131,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.433049679759616,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7035,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.39746170986200574,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.727,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.390316648352027,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7329,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.5461119127144377,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7774,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.37948324279727014,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7489,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4129850534536828,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7478,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.37358709629956094,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7256,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4030533874194643,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7531,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.40792951134610655,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7589,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4494647090693021,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7628,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.40488625838702924,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7828,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.44909496919489056,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7729,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4533143891301691,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.8096,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.534450701226846,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.8171,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4048228043574203,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6978,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4250809091760721,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7161,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4493275958933995,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7077,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.36313082010522685,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7488,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.8067070103111783,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.823,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4732468862594865,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.795,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4929348065341102,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7082,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.44341789397499826,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.8003,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4867929464096527,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.8053,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.4384093233663122,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7734,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5702832504318212,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7345,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.39132879436845286,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6696,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4934264866325654,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7325,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4603780809347021,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7395,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.46640500803625956,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7166,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3909709155701661,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7246,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.465947545486099,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7073,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3799746763745742,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7366,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5134298034724935,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.8144,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.41525626685746175,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7505,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5349137667265772,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7829,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.411967581572483,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6906,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.40928029419771667,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7107,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4230622141851907,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7622,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.49519974665945765,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6816,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.450165136834805,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7455,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.42307727725626,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7468,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4045647312806453,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6947,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4735590110789679,
+      "learning_rate": 0.0001,
+      "loss": 0.8264,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.5044907920990858,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7254,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.35767594936525043,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7072,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.42213819388136953,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7865,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.5148095210861435,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.8567,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.4085537065981769,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6186,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.46419483428766023,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6963,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4487462892431637,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7467,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.36919109547608464,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6777,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.4332804643362148,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.8242,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.476007591449096,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6951,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.39042388236881836,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6984,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4568185090445949,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7992,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.379082207436869,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.41974138322610066,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7113,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.5327757442213293,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7839,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.46462763671394586,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.747,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.4237837783253501,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.679,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.47716642768101986,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.8039,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.3990031991734295,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7181,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.43282253577010726,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7476,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.46147540239235874,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7725,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.6306841195084987,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.791,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.44299697883912553,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7714,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4742086986588113,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7892,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.40386184053331575,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7201,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3943880217838005,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7554,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.47769776472851405,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.8727,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4309249235815185,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.675,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.4968939254696869,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7007,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.581901884314562,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7262,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.47410879725161503,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.796,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.5017773554214965,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7131,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.4827622442496861,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6633,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.47634020407804994,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.788,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.38496982587382605,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6744,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5585107304142405,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.8145,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.5340891376397738,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7253,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4069800636168623,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7443,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.4322847059956011,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.7448,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4037556172005087,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7598,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.42088050249324677,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7613,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.40703844656525856,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.7048,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.48121410465951564,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.8222,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4596219784863138,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7658,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.38319275608875203,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.69,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.38323886885120834,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7025,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.45770876533845695,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7418,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.46141885369023555,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7805,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.4594229161181626,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.8136,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4688572983621563,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7967,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.4267776312967211,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7141,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.42227533245492355,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.755,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.45567834015494074,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7297,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.8492105997024391,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7793,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.34574383083982096,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6215,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.37546278553434914,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7651,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.43367219836477805,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6847,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3765877233648976,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6522,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.37197353852559156,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6724,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5893490383226295,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.9073,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4147043351321972,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7421,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.43929000502257476,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6555,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5101682678182368,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.8062,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.36406036685719256,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7238,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.39799113361555244,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7179,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.39315786057172425,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7108,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.43881095481650995,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7995,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.41286048761294303,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.8183,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.38009599010775147,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7429,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.37823264328365747,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7629,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4229441576338051,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7849,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4836979525548324,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.8376,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.5008828679160982,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.8903,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.42445687492843887,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7238,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3806642176576031,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7443,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4138456130919022,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7149,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.3815781184842867,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6988,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.38969147637983625,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6967,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.4092585102597334,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7169,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4432734726452317,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.704,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.39276989933160955,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6722,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.47489068681029745,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.7481,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4298128905708399,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6799,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.5096331770598059,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7565,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.41638459245578724,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7637,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5261378797450693,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7515,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.3974344319305451,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6764,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3977459350094927,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7321,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4677168900264908,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7825,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4408638164696113,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6959,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.49118497531075295,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7222,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.43683565263329055,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7531,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4367885778809987,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7301,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.39642825524746744,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7276,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.4015203362935855,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7293,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.44012206651588054,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6704,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4528152261382792,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7571,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4232884321834926,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6692,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4589892785877267,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7233,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4121692638871751,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6903,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.4974120424904181,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.8023,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.35332028323560105,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6874,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5184079608375904,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.69,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5010633649261649,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7363,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.4882482668086425,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7833,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.39104905325649925,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7307,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.530495265099178,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7411,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4895216806828033,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7393,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.4187151068051059,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.742,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5769953743315126,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.806,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.37923197810456,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.7264,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.4117013354698191,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6537,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.4866274176306346,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7663,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.37932179125869436,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6625,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.6437435262608248,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.8709,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4226429641505177,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6482,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.4129173694423031,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.8046,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4402767096651791,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7119,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3933043200442359,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7576,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.39368578737069154,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6905,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4252957814217015,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6766,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.5245243270238202,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7349,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4445948767256047,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7471,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.41184138635514905,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6516,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3658960420810612,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7316,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4431033651023181,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6836,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.36265698022749926,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6612,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4611539094672187,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7611,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.35399277493822134,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6796,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.39985353378296284,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.685,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4091364829563213,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.765,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4004115744612513,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7099,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.36773690341408966,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6552,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.456949920489654,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.8078,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.42208864825015796,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.731,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.43779177036677636,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6511,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.40841748301998027,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7144,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.42406716442802694,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7453,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4280874157412337,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7338,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4119513245583709,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7045,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.45724210670950644,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7179,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.405964538237769,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6856,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4527419433584937,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7454,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4802116690936312,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.721,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.39418840743948563,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6637,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.44283330912589575,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.733,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.576205073795686,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7945,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5340403767407518,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6905,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.45573412321129403,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7426,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4619023384747589,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7297,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4167715788679408,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.666,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5297008170441028,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6242,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.43398173481967217,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7218,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.464824555470651,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6911,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4210661005022877,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6994,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.48277212401999947,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6729,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.40948887298085435,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.715,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.42391394456952675,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6595,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.4237343785261066,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6642,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.5415525593262541,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7566,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.38128734164176914,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6635,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.42449619629861135,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7446,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.41246628676655095,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7489,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.37092692640773334,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6306,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3501301527113845,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.703,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.45176747297015113,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6592,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.45908283704016833,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7617,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.39489988126749265,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7538,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.5291265836161576,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7862,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.34729379078982947,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6901,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.38199809974385573,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6841,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.38819616099637727,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6942,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.43991208587073005,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.7332,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.39666762145102386,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6615,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.4363503316776636,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6502,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.42029988863876855,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7245,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.4407884192788708,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.7472,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4435916356849103,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6827,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.5220657409120679,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.8304,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.38719546300280033,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7387,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4630988967754445,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7507,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4814085621451986,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.8103,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.516539394331081,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7419,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.49268882102769035,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7238,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.36707797580532886,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6922,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.46391162167829475,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6503,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.39943678330719323,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6815,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.41533716224726513,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7624,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.45240632629368666,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7307,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.38785608034491204,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6346,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.48034518746902743,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6384,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.41332667504283876,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6829,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.4337494393162549,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6848,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.40607918521508257,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6542,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.35771532616714113,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6839,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5254334891702237,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7895,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.5118367019902981,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7985,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5483547105729336,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6695,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.435548249274326,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.655,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4599163753660314,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7608,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.47529799400955497,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6728,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.4475815935368909,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7271,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.40024151914975997,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6488,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.4049672840482492,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6829,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.485994698610914,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7746,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.40804461396074054,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.7168,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.44647625717321937,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6938,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4597697470689429,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6639,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.48485579308178456,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7607,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4194608779936511,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6819,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.40211362916556903,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.741,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.43258769474831466,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7362,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4665960330004546,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.7407,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.43638629170945437,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6793,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.41166650165988555,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7082,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4976658974941119,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7379,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.40432300480341915,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.669,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3847586744856044,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6718,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.48610078187730693,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7641,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3442919441790375,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.627,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.40165443953952795,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6798,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4111172074189193,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6745,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.42550186302416637,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7142,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4198048455350389,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.682,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.42105356184842474,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7032,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.38694532652504987,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6954,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.4346602726592264,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.7521,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.42171087258624906,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6507,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.37950491418682564,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6733,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.42228264645197383,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.5811,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.42265676405283753,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6716,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.366032607113853,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6674,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4273846569885087,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.7321,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.41439846077578496,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6743,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.4800472014547638,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.8113,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4170099701429586,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.7094,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.42684358092676894,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.7315,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.36759838949620705,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6375,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.39182460906574307,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.635,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.37155113388771127,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6679,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4892313200523182,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7773,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3614705577221202,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6662,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.40056040990097586,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6835,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.48693182831639875,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6551,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.43409098562506476,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6583,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.44222427351456456,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.8126,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.40367878960302256,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7328,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3878452948297235,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6766,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.4460323004358859,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6327,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.41623353636179367,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.7224,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4414137781565002,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6397,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.39615571862232873,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6526,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.42988070037803233,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6705,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4011019198453032,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7098,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.4526937752135543,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6943,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.414733452155201,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7004,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.49479347833939974,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6822,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4196809553904605,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7237,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4433795918252207,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6647,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.4730290908266958,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.726,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.41578174368248944,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6727,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.47562360950917576,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7028,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.43046163526997777,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6867,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4386045894305353,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6415,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4250490242198921,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6778,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.43396467538084366,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.7061,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.41093749413933983,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6798,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.42716345262559524,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.7352,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.5342245884546438,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.7963,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4869935685121873,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.7474,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.5377492034909038,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7281,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.40252067300632566,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6188,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4732759934224106,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.7283,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.36939415621243193,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6588,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4768166229427656,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6875,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.40826207503606476,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6603,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.4342070575118136,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.7266,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.46573911518925726,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6629,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.4163287933190186,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.633,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4599398479149586,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6399,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.4208391764061762,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7828,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3866556900143404,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6665,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4383711039390692,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6948,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.6069140919286845,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7724,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4134702599835318,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6775,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.39534120734604034,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6894,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.4500504918539995,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.7333,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.526748092187756,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.7924,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3870731710657465,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6952,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.40571545726534053,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6893,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4575999319147385,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6865,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4799332271789479,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.729,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.5887484421927903,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6209,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.43168284858047784,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6994,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.539889322027179,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6928,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4984417364968895,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.7475,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.39370007146273023,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.7119,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.38284303572252104,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.641,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4101442442522563,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6702,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.40298946757857557,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6648,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4983330016825017,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7611,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.5495717532741172,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.816,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.454551235327257,
+      "learning_rate": 0.0,
+      "loss": 0.7579,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 543415710547968.0,
+      "train_loss": 0.7690219638824463,
+      "train_runtime": 9697.6766,
+      "train_samples_per_second": 1.031,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 543415710547968.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d9b6b576818b64d18ee005ecc9a68eeb6922180d
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj",
+    "v_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..df1e3f17de7aaa6d2bc673852858fc75a3cafc79
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49193811350fceac6ff35d7a8e2c44d86e2897dd1908ea8688102ba6c4dd96d6
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aee3bfefd48577574a87cb32a867d54e9e47eb88
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29d8d2b5eddb734787584954dba701ce75115d53d4b55cfe642afbb06871cc4f
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..59eb9c8ce67a02aa78b6f3284df17cde60cc7fdd
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9106693837526286,
+      "learning_rate": 2e-05,
+      "loss": 1.4166,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9150250397204797,
+      "learning_rate": 4e-05,
+      "loss": 1.4921,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7355679850665631,
+      "learning_rate": 6e-05,
+      "loss": 1.3632,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7891276176019939,
+      "learning_rate": 8e-05,
+      "loss": 1.3311,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8090098490379414,
+      "learning_rate": 0.0001,
+      "loss": 1.1543,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.8151074320199535,
+      "learning_rate": 0.00012,
+      "loss": 1.0282,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.809346750846133,
+      "learning_rate": 0.00014,
+      "loss": 1.063,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6372252258219455,
+      "learning_rate": 0.00016,
+      "loss": 0.9674,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.47471192891119973,
+      "learning_rate": 0.00018,
+      "loss": 0.9224,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.43416804788081265,
+      "learning_rate": 0.0002,
+      "loss": 0.969,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.41593483043885393,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.947,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.46251156589792036,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9553,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.42505280208450474,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9056,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.498402240030992,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9016,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.4916638726847002,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9454,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.49106508322501824,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9412,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.43278849589900825,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8991,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4526138687244381,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8968,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.37678340559461915,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.861,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.39240358043942264,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9177,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.40094288576049547,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8959,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.45673105078652454,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.8908,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.3926631168011837,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8175,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.45952477937088554,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8398,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.43896138205695384,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.9402,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3406810043098025,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8302,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.39297163285135195,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.8725,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.36185852199663304,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8859,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.3106879438825548,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.818,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.37724029656618624,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.888,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.35228625590220625,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8856,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.37806390979081067,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8491,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.35078254894013106,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8164,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.37913950320639267,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8466,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4181875377614008,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.9186,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4043945986637868,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8783,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.3174107328080895,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8019,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.3345640199753624,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.7977,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.3339024057811489,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.7785,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4680186115901849,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.78,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.3503475171946669,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.7886,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.33065456175873204,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8181,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.30800754949755055,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.7451,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.36298560164696553,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8762,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3287578419703281,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.7494,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3375297717853042,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8056,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3607363104653381,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.7577,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.3253489154409118,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.7929,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.35589811863739607,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8088,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.35912939752872436,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8456,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.3600222012087073,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8688,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.32996798554134926,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.7576,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.36570201256321944,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8121,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3379751853927354,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8599,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3262085269268287,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7844,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.31957254739098756,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7951,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.3558840401659398,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.8208,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.33425963122126895,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7737,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3694001379431872,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.798,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.32885346367760754,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8075,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3285771099087272,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.7653,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.35925157593697005,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8244,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.38310318033177926,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8022,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.38208895577179514,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8795,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.34205407357191786,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8261,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3335723562628723,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7516,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.39029161117617034,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8797,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3186738975446756,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.797,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.3492691721803383,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8084,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3386299809469852,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.8003,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3432083736446383,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.8036,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.32411203608189576,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8045,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.34205019992181324,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8177,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3393413929561874,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7829,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3403930731634467,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.7705,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4563705848356069,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.7094,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3356557868672747,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7877,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3343961782832242,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.799,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.29896123475559366,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.7427,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3884174599267879,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7676,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3762609083923659,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.8712,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3238605671110351,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.815,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3318596057413296,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.7978,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.37648246871592933,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8206,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3609318128948997,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8217,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3240938832354287,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.7493,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.46518441453936255,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.7919,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3885763765239273,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7677,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.34957619797079137,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.7629,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.33173392274808255,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8256,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4146598743193586,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7728,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3292399124225849,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.776,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3278418203942441,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7878,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.2936962503701319,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7713,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.37106134444980204,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7824,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3591030974883882,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7817,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3467533106037834,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7725,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.32975563402977487,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.7484,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.31496869619076157,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.7379,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3402749041335752,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7306,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.31941766632295754,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7572,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3015192715759972,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7023,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.32148409942770195,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7991,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.290285665482951,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7163,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3610296010151392,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.792,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.32500368638370364,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7554,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.2940274549940774,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7591,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.34867458636709453,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7721,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.40554825556846064,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.7677,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3644334745678942,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.7893,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3226736096226785,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7982,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.29905498065898395,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7288,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4616258882615652,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7713,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.38088529191812087,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.772,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3568898489498515,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.7564,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2865326409362873,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7457,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.35191316566879094,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7629,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3047226349467024,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7326,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.29800211432506263,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.6982,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.34687981717363264,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8012,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3306311760889097,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.7926,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3406212491870728,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7864,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.32377838938476877,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7276,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.29897427831595963,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.703,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.37386473627609185,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.8466,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3698269446125909,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.8009,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3368127495556458,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8202,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3237476463764862,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7675,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3281958141162337,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7686,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.36031942199209543,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7342,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3157432648234459,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7574,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.34776666895501035,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7506,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3012211162298138,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7199,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3232381306031035,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.729,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.368877912773277,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7859,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.34015263865850554,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7422,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.303778036696223,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7562,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.30744559225466855,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7674,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3451312795488895,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7793,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4812586743513979,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.6997,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.2795763029314522,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.721,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.33010817156052036,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7574,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4017483091261422,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7281,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.2873340263031428,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7483,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.29693018943938276,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7638,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3193708762335958,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7897,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3467588383503259,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7431,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3201402192283141,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7126,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.449068735615328,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7841,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3493031254413651,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7467,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3369590279859377,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.7988,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5187049166800942,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.7486,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3091547818218103,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.6941,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3280449119692185,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7202,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3167129579508421,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7113,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.34194146462229297,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7672,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3394190438831376,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7664,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.2866043239924286,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.692,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3193779930417426,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.7195,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3036615572504567,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7385,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3211627469185093,
+      "learning_rate": 0.0001,
+      "loss": 0.7513,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.30853424987560296,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7098,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.34048566014865345,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.8106,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3555812501300445,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.6588,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.30003114018974325,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7057,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.34098331573546814,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7539,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.31843957799356415,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.744,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.2827599722340685,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7057,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.35937660615313516,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7613,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.32390203597909806,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7375,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.2982931570036195,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7293,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.37475104123017583,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7729,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.400425482560844,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7764,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.30693636509591,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.735,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3312361648741088,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7684,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.342117677610307,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7056,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4026302548599891,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7536,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.36524291372842144,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7166,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.34879105119066595,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.741,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.35691755313184303,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7353,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3063866868885555,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.7477,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3354965106858531,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.7281,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3438439178838029,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7948,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.2813875656620238,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.696,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.32537325052888916,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7581,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3969564770900358,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8019,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.31388734051820155,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7345,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3741797502750705,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7549,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.25861948653068545,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.695,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.2857777460426956,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.6682,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.35310800733438846,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.7848,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.34819520216697436,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.6982,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.32569889775815364,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7654,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.2945547780260103,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7129,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.31955365316009615,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.8102,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.2761627197268354,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7524,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3480882847368592,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.8154,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.35497213312331755,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.8084,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.29073504801842387,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7297,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3006796917539789,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.6998,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.31029757927940815,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.715,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3162419517676427,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.712,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.31445541520912884,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7179,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3572593415712166,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7542,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.28509856829528524,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.7056,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.340773802794312,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7473,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.3427328039402282,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.7442,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.29642040414266996,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7347,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.2969940936232079,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7017,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3065499410397904,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.718,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3225482706679557,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.707,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3117434011223896,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7452,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3388025296915813,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7173,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.33139733316731435,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7571,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 1.039675293634295,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.742,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3358206672686245,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7764,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.30238318926136615,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6912,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4638805485158481,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7157,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.385267310334133,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7601,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.313400211944292,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7585,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.2893788658587654,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7226,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3166132206984673,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7063,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.30630399306954587,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.6999,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.2854119528504984,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7085,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.30178369626580587,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7154,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.27405217029411794,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.6868,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3429699375170305,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7437,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3896398683462038,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7372,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.31882873277957113,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.6966,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.30475473837546774,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7325,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3054589522030496,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7201,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3073347601274914,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7077,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.39566012301677733,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.7389,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.28697664729194006,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7011,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3943475964025582,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7491,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.33464501189479606,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.74,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.30655764106334443,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.6499,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3390622200008982,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7135,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.32370061204685463,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.6916,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3102019687302034,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.6939,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3650056108389055,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7155,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.34275722823771554,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7095,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.2867086174953593,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.6926,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3751058033742623,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.6866,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.32868973447271876,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7616,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.32031865758716654,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7433,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.28430902291291127,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.6928,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.30244143016278335,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.703,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.35639534135164314,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.6909,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3192293375780227,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7211,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3498635928667987,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7929,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4134395803571079,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7873,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3450344604974233,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7368,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.2969607355615406,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6761,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.2891440442732942,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.7284,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.30147698742460943,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6855,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.29579967648966576,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6673,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.29733957118580456,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.6737,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3218007344725993,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7439,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3491367262180285,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7418,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3213424044123325,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7144,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.35003192439763947,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7077,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.2949878045047657,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.672,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.30356393661543046,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7541,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.33072054793836925,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6849,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3268536790375609,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7249,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.30520956028690377,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7456,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3357206914705883,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7142,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.5448399393246839,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7308,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.28860683769411816,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.6767,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3100248803449033,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.6999,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.3327885931301477,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.6844,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3138077626194803,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7036,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3243992230675338,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7094,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.32531144597392186,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7082,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.2924185207467151,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.6355,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.29206907180124286,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.6772,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.2973521290026205,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7106,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3145736899748929,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7677,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3157293519172516,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6904,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.2870726912108204,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.6586,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.30906828128304625,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7286,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3182529993154354,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.6741,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.32179537551574167,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7388,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.2810823397409509,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7148,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.32333801256970945,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.6885,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.33254138632936137,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.6545,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3343083470572911,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.6984,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3194235142724928,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.708,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.32418757353426036,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7122,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.33992621355635866,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.6996,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3226877235528863,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.6956,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.3171383430893597,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.6694,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.290782512230605,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7004,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3263108357933877,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7165,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3634629078675466,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7765,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3377377234300096,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.6796,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3065727226496677,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7016,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.31834091430652933,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.6788,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3292102282324205,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7016,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.31793484496480684,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.6436,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.31122498533982473,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7317,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.38086901286840796,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7416,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.29772767023521746,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6916,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3532109944946722,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7684,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.27767796371449827,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.701,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.32597557938540084,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7163,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.27544315888032217,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6651,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3697743104755361,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.7284,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.2806673046158917,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.6828,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.30407061552813863,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.6736,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3850573291540645,
+      "learning_rate": 0.0,
+      "loss": 0.7934,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 786873517342720.0,
+      "train_loss": 0.7706633139497194,
+      "train_runtime": 9555.5288,
+      "train_samples_per_second": 1.047,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 786873517342720.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..350597a9a3f2f55f635f1b6a22363f469e3114c8
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "k_proj",
+    "down_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e9baa578b5762262fbf12a880ab00f06136c0034
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0756bf3eef72821dbb19ca341b1fc1f85af345bfdb07bae896a6839f50499c53
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f52eae82bd669af732ef02c0e91f92d85a94a515
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb1c893384f7ee2d85fa351f4f4003c8e1569154f490c2023da29d9b714ba839
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9dae2d6fe729f0b36168a515787908873b7fd4ec
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.9738718905282882,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.3776,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9955982617174386,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.3593,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 1.0292629278624237,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.4638,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8691424278818122,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2431,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.9261316600857898,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.3732,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.9065393240134191,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3584,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.7144291097146667,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.0929,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9158998506068002,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1723,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.777438611058757,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.0773,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.314136422443474,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.1588,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.8692683248594849,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.0121,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7621402947276783,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.9942,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.8434274430077191,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 1.0715,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.7245209141052413,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9671,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.8256935204711093,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 0.9891,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6234866086859933,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9121,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.6670638916913187,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.9312,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5685270034758133,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8739,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.5662279425727015,
+      "learning_rate": 0.0001,
+      "loss": 0.9169,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5321164781312352,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8862,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.6452928193096523,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.939,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5889563887372142,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9265,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5305268661758419,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.8958,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.4374645727689325,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8491,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.6265051901301006,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.8892,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.544618577492362,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8929,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.5020223053534353,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.9309,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.6790382740530038,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9323,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.6384546114121064,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 1.0031,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5551995120063944,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8097,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.5615200663677613,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.8294,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.47874175026665533,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9338,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.5645765986458627,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.8836,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.49163986066497906,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8248,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.5207241540650344,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.8682,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5186393720573987,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9449,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.4900931272240655,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8828,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.4816533288557279,
+      "learning_rate": 0.0002,
+      "loss": 0.8889,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.5158896150522022,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.8074,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5850365403221622,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8836,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.4701167474534878,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.904,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5549098311815541,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8947,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.4648186239868878,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.7969,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.6391364745948533,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8843,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.5076423625352378,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.8081,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.49126687423074844,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8972,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.5017726655140315,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.8712,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.549440628986534,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.9312,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.5355737096496868,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.878,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.44999051862035433,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8084,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.4900749507095087,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.8316,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5117994847013144,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8608,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.5793930106305719,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8786,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.6203928016888752,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.9021,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.4395642052217729,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.804,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6327858480994747,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.9353,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.43822231696617303,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.7973,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5686770926878106,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.7992,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.49620141131497186,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.8901,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5336489874875604,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.9136,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.46455062000355124,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.8072,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5169487116900677,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8492,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.5561981975277991,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.9055,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4812674310307808,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8738,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.5215732343292463,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.8656,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.5148244034689009,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8197,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.4442934586215466,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.7518,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.6135294026474913,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.9257,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.4947662845524055,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.8183,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.429456280197822,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7379,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.5542940527933353,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.8827,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5532480090464903,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8782,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.4697101553880468,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.8361,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4868559323245861,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.827,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.596419404041506,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.9036,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4534360086341495,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7968,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.45360606459520647,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.734,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4719919379702672,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8371,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.5251165433982562,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.8563,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.46847022350052253,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8448,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.508156193308429,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.8555,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.39701579915605517,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.7728,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.5398851920389962,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.8473,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4746705891110577,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7767,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.5227545409963591,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.8389,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5047015184751248,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8786,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.5487063304465213,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.8583,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5406478590263915,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8345,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.4556906536085241,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.8587,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.46457156727140836,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7514,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.4734733161717522,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.7886,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.46010879443324293,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7652,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.4997404000177325,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.8798,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4790654404778781,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8564,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.46309032055654636,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.8327,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.7213308815478991,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9652,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.5197579060478195,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.7527,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.5250384341038005,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8537,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.4679433715996057,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.8266,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.519912535189573,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8234,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.4760641770357719,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.8254,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4603282007871293,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8703,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.5473069069521743,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.8951,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5206514832514219,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8368,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.42955480551815406,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.7217,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5008905219462,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8121,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.47501177433724273,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.781,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.49681252121417474,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.818,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.5914157668396626,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.911,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.46936059512369693,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8305,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.5234891664109574,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.8048,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.49901884728932266,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7315,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.5235893588604486,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.8104,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4953910108921592,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8195,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.4973017892932648,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.7827,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5023532567363218,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7809,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.48553556375331025,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.8013,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4540201716573498,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8449,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.46022660068927584,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.799,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.9659308169721341,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.9912,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.4884710181225921,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.7529,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4690345640057607,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8658,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.5420200831110911,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.9115,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.43988541415262633,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.787,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.49884742778224017,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.8642,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.7783246401612559,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8818,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.46775227339587644,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.811,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4463159134719736,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7858,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.43890830731604036,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.7763,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5169698296712399,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8259,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.7162596832908007,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.7395,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4938875225187638,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7824,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.48281860188060755,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.7633,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.552867873369316,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.9141,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.4702107571476058,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.782,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.6591282984364222,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.9173,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.4801391907009083,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.8243,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5655266450419159,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.7905,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.5214781136343931,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.8653,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.49152275049944777,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8119,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.521274832597228,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.7943,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.47112952096647953,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7712,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.47792639731028924,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.8015,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5031459552571963,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7739,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.4246137815285982,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.6983,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.6104344906082924,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8464,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.4249619010799741,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.7615,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5748627085289674,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7686,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.5137566844133159,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.8338,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5002092420518957,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7562,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.47194552572414866,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.7983,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5218303185521903,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7851,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.5264832068089446,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.8371,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.478203962759533,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7616,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.5438026741934794,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.7615,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.43902989895135003,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7639,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.47051131378806954,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.7129,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5002910987291495,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.71,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.4076232209792227,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.7918,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4730770037833421,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7892,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.45664931470747816,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.7543,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.47178849520868527,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8437,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.467588792179776,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.8021,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4972992438538472,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.841,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.4468720005820029,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.7794,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.545370771508763,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.838,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.5454717084972628,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.8629,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4357282147317954,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7779,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.48593556883421596,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.807,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.41599332497007774,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7772,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.4836889364929825,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.7779,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5252176172912414,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7382,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.597492267186273,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.793,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.50643481417428,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8025,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4863957284362853,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.7685,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4126442552699547,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7851,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.45326416402953634,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.7749,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4227596656395421,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7664,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.42788815992693824,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.7915,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6288877822241705,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8891,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.4507238893173025,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.7759,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.47300214317734723,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8498,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.4401017701118483,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.777,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5356566612020834,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8127,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.5113524460982788,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.8786,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.5291692741521824,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8393,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.45843548873895057,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.7901,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.46690384493517495,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8058,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.42269965222515665,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.7408,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4744441145268392,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8013,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.589734999819277,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.8505,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4294515422334175,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7768,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.4744430990923323,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.7609,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.4559501850152181,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7523,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.46357087350772636,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.7601,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.46023959041271456,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.796,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.4727064727182715,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.7954,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4569969267217822,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8656,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.5938746073907192,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.7622,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.48850488082059007,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8259,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.4681251940836538,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.7065,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4735748030725397,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8068,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.47529092333896694,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.7315,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5902306759720399,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8338,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.5230818479222556,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.8022,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.4534663637266558,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7373,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.4708997916866323,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.7071,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.49362197885694487,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7933,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.45203709880515,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.7832,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5493177817059253,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8254,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.5634237288206593,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.8056,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.42048467773105236,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7727,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.4899154747867312,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.8376,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.5710692282020838,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8309,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.48806709144026006,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.7442,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4838117679253375,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.787,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.4580816421898731,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.8085,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.47596264671869476,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7975,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.44108018864388043,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.7784,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5149929952820737,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.6841,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.6399958813065738,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.8172,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4698697208453852,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7776,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.4801018226598063,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.7831,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5010754074291259,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8059,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.537583317946107,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.8351,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5463136427932224,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8493,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.4957532756389482,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.7829,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4842215035332732,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.776,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.48734113067673895,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.7234,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.47276663145448383,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7549,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.43463312763994316,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.8188,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5307171453933577,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8506,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.5793489529825944,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.857,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.41645727653838105,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7675,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.4724064375837242,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.815,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.533774522533951,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7672,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.5541004916699475,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.8889,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.5661791909987512,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8735,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.44873804371044396,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.7042,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5304400188736889,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7533,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.44327993489571377,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.7866,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.5255641319650115,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8282,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.46028409296582895,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.7753,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5031247287857912,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8181,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.6173573256457165,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.9006,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5298715602472197,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.839,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.4911255833322417,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.759,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4906696061287372,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8528,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.4668567335860601,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.8208,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4345032605839861,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8167,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.4984211985447478,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.8302,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4572714117537344,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8184,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.5206899190878905,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.8066,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.4854794107900831,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7537,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.4609677848342807,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.8063,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.45996489083268266,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.8506,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.45707766102052144,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.7912,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4340542190902398,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7575,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.5497680661505181,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.8477,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.43255985506713984,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7152,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.4520541936610211,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.7711,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5552186923148112,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8749,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.5240005896907025,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.8577,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5080128301456308,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8368,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.4512704910911856,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.7881,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.425427268205857,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7531,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.46175851059255807,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.7513,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.43532549688496996,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7672,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.5150647397664155,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.8248,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5015860279713167,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7788,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.4094608213167549,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.8107,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4506121936050682,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.785,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.38251503700435563,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.7542,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.6011698005621798,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8012,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.637325574994567,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.8774,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.48830475781118704,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7632,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.437959458199466,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.7968,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.40995994254323775,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7322,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.4760848633445986,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.8289,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3968332181154129,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7367,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.5853394391073075,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.9217,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.5055996499763276,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7689,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.5171528589084252,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.8076,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.48753167002443065,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7365,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.5593692966320034,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.8296,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4894130517380097,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7783,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.4183032507797536,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.7159,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4854918953495465,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7768,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.45780025047823086,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.8142,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5152620899441052,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7786,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.5549668785671925,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.7827,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.544306140170751,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.8029,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.46360456212875956,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.7919,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5060777619170408,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8377,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.4496308552363371,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.8139,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5007864295328988,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.825,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.4483008493163282,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.7863,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4391112870508659,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7797,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.47018233251197405,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.7924,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4798517777508943,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7745,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.49095855616960954,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.765,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.48356101095398024,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7755,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.5580403015191725,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.7914,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4713539577022985,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7745,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.5205108027761377,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.7512,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.46116331908739044,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8286,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.4577523085902634,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.7494,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3847018815541359,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7431,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.46143177187220114,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.8167,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4546653053351597,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7887,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.48515315632333816,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.7682,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.42125610166318656,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8073,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.5224634896260647,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.7344,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.4258122656750229,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8215,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.4409516005421548,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.7652,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.5402199309929968,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.8455,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.4327780968915563,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.8579,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.36305977653280086,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7184,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.48196845883166595,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.8101,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5203666483240921,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.8734,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.4146613232763745,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.7822,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.6393924480453517,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.8473,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.5014244971007478,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.7994,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4467471708728824,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.715,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.44927464260407846,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.7876,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.452092010139311,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7384,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.4505483607042589,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.7741,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.45737335478669194,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7318,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.505105062630817,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.8462,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.5485209182353045,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.8205,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.47935014604862003,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.7916,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.46851260385721555,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.8087,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.40742467259083315,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.7288,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.42412473687814045,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7749,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.4540663038141919,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.8279,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4814886187089166,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7954,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.38612348667503354,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.684,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.43797411883152226,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8172,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.38992309677963055,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.7729,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4317859019917533,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.8122,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.42022534256609423,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.777,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.40811977801292604,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7008,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.4297400709769793,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.7693,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4801445875017632,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8153,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.4934539899070924,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.8032,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.42513152532831566,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7752,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.4794991935777699,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.7782,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5087050287378877,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8042,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.393865561718882,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.6906,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.42383628171051063,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7239,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.6121834981137757,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.828,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4423952298587943,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7793,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.41904903851629355,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.7222,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.47090218219971486,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7864,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.3913200013056769,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.7107,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4500172163246463,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7604,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.47203881705422795,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.775,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.449367277679672,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7392,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.3621692554096837,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.6913,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4705659025575893,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8159,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.4391531794127377,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.7539,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.5471100071607476,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7569,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.39983579832917754,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.7594,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5433063322457978,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7739,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.5158257022171956,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.7932,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.43334724350730025,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7747,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.5418026742631086,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.8507,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.49728990873135537,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7716,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.4930679346000872,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.8152,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.40703810585703953,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7123,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.5386416550361496,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.788,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.6681961851973703,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7419,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.46698423556575525,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.7497,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.4462693969319942,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.795,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5162984669055487,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.7959,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.48234426398524705,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7843,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.46155437780670966,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.7643,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5104579264182729,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7618,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.4576691284516531,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.7593,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.45151819648907454,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7874,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.5027216665292459,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.7623,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4322680615922771,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7644,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.458154934846738,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7795,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.46071618810128806,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.8209,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.4394451330648298,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7604,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.42064813035167875,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7284,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.4554895386626795,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.7627,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5299365332857332,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.798,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.4636899840429361,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.8192,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.46048503472651314,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7394,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.4170627043386341,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.763,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.441786177043673,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7349,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.3906300471935325,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.7193,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.3820125321487123,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6502,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.5236625305576288,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.7858,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.49316645616575255,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7877,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.45537293006273666,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.744,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.43520294503636914,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7514,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.5263441910538149,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.7887,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.43733536031533465,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.8091,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.4509836961161326,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.7972,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.3697325898617728,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7448,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.4149836456688446,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.7589,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5291721262591771,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.8298,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.4592801100745272,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.6963,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.48305455164459193,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7279,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.43414795715124715,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.7333,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4232467191471432,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8361,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.37099563519311474,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.6612,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.463495269434473,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7525,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.4051273675714783,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.7357,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.44065204587172313,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7433,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.404647063494323,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.6644,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4367938419748247,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.8084,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.4291710814932179,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.6774,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3710638351771584,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.6896,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.47217298232188626,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.8498,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5599795121408392,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.8025,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.4820028429977975,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.8019,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.8103269160239212,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8533,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.43886002005983965,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.7961,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.4264274993725706,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7368,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.37652450930794035,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.6803,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4843136136830526,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7653,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.4645581295541479,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.7339,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4039874199374106,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7783,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.3968349526012385,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.7276,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4033776124012642,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7334,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.44970638069998753,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.7736,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.5501628335450935,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7011,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.5046304397885234,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.7764,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.40623832410355104,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7166,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.4276686115329016,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.7184,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.423668162878283,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7227,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.3927314621240658,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.69,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.5119999368926385,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7183,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.43203093657186026,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.743,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.46063329097074923,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8199,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.41210379418090887,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.7144,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.40093045861548976,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7849,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.6220898454077841,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.8912,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.4973328165816475,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8178,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.418748668488691,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.6893,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4275334770458131,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7458,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.4258922571938356,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.7532,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.47720760098269094,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.8392,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.44154393073423315,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.7193,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5401432877123872,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.843,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.5545136321727958,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.8185,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5771994605485644,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8423,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.44985387889753464,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.683,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3804634086933775,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.6856,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.4327700820962517,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.7165,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4595450183265034,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7779,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.41463583124442244,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.6957,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4330408248305494,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7895,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.4300916093302638,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.6724,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.44397586704291414,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.743,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.40246850409442514,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.7071,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.45024266081088654,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.739,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.4682043754286765,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.8009,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.48680932957793543,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7452,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.41878898691483496,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7418,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4319413092291526,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.6993,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.39720449382891126,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.7198,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.46131919117475345,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7732,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.47845851428090297,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.7577,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4193860309688804,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.6731,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.5199556138782738,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.8273,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.443457538816095,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7714,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.4982474765380297,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.7796,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.45278238492691764,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7355,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.4328358546149195,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.765,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.39592843909567915,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7776,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.46452178512022113,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.7896,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.41833633625501077,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7048,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.514852927784867,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.7816,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.5257797374413313,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.8428,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.45636398078426904,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.8619,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4502039476106067,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7963,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.4243820031639593,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.784,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.5184189715289644,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7735,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.4547892891311182,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.7617,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.48233412931694103,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7788,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.40083273522540647,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.7021,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.6194283397199801,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.726,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.39974681678200796,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.7531,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4422340111807094,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.822,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.3887067782631618,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.6595,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.553404541087564,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.894,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.41896343138385883,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.7361,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3738436164502759,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7367,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.4483262760759903,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.815,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.4395461639913433,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7917,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.49483075047931446,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.7537,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.44992977192842065,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7668,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.41950839692838165,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.6612,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.4087702174450992,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.6966,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.4183706402748134,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.6523,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.42319846444919756,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7032,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.38483029451499023,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.6604,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.4034412002610339,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7194,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.4929472362741926,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.7285,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.42801487666111165,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7555,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.5219648385158142,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.7975,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.5078211284193989,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8362,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.39187948139759565,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.6311,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.45280979149866263,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7127,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.49302221491791043,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.7644,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4348177938826427,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7979,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.4406643805434921,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.7121,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.48066300539551704,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7621,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.45414616840784405,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7462,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3984240043819688,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6972,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.6071295742863728,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.7026,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4134558242804434,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.686,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.35424121409366033,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.6849,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.40853146718648076,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7584,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.4504575641258578,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.7417,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4048789202428273,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7273,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.5159833545425263,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.7396,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.45391289698427356,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7066,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.5330981927253413,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.7406,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4552377694570901,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7413,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.4220690556860606,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.7691,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4156735467250489,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.737,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.4849548220572441,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.802,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3821922408993975,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7042,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.4391419153908085,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.7476,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.48605094281470546,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7299,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.44719999619049045,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.6288,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.37042463200431835,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.6195,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.44775857852904416,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.7556,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.42829522654490987,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.681,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.5928330924887514,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.8601,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4594659671403964,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7738,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.4465430347318887,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.7374,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.4268335673175506,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.709,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.41663005710297984,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.7658,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.5040094125671729,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7884,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.5921053420891155,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.7267,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.46146114546649125,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7407,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.524385845131032,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.7407,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3908346833006848,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6748,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.6078837577120809,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.8619,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.5322345512743014,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.8594,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.4634894284627779,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.7469,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.39088436223660744,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6933,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.6228952211808515,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.924,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.39138157660741213,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7763,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.3461019712679376,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.701,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4471624419737993,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7772,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.4150378407307194,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.8258,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.37942087611900394,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7186,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.47123832417396183,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.8035,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4504207833314274,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7521,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.4428425121890588,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.7257,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4015281007614013,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7097,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.4478899906730631,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.7757,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5119729739688551,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7681,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.40673486902303113,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.7142,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3726644043800367,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7259,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.4138540260403324,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.722,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.41276783524920124,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7666,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.3931117358126695,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.7441,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.5149228654463832,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7127,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.38730792703104727,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.6845,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.5042696123128442,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7188,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.5211459225121182,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.7415,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.47237500746277383,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6767,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.44101750359701125,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.7766,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4376179162106715,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6794,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.443132823304689,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.7069,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4121264584438154,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7014,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.5438632693866221,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.7113,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4363082522071395,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6613,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.5094548228524022,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.7701,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.42663679470568217,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7509,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.371674990751324,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.7048,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.42718377435661903,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7327,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.4654016531114095,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.7099,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.42069923667116677,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.766,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.5169680150545506,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.7042,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.523955961930124,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7869,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.4385714543537812,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.8399,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.6340599411671402,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7222,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.47338371501262316,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.6802,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4151024888514726,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.7212,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.4266234124682058,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.6584,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.43762734425987504,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7147,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.4134839753893968,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.7353,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.40694547601123865,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7192,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.3564513017284066,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.6465,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3845545303416391,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7286,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.4163438169383608,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.6846,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.47824121432178024,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6886,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.5266270866168399,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.7648,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.5822376742880172,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.8485,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.5542816264186629,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.7644,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.45585766253161186,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7989,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.4333560334878548,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.7552,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.4609090181524356,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7392,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.4800939290535579,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.7406,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.39598022061191823,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6952,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.5500799247212219,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.7921,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.47244542913085974,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.8139,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.42058907283470487,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.7463,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5014730249496743,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7452,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.4745672807425004,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.781,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5376525537845632,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.8153,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.4653577531089348,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.7893,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4538491457245736,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7392,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.5227869138818918,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.8347,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4550754633589422,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7233,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.36374678340939803,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.6873,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4656775039023448,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7299,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.49469206366660884,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.772,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.392463643989344,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7124,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.4133034375857957,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.7065,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4821380238744263,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7867,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.5300861647192535,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.8107,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.4491881480521485,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7575,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.46263469968642656,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.7372,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.46276631407191143,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7248,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.477287224280608,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.787,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.41135177209160717,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7162,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.3898977361266114,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.6744,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.47144778165038753,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.8,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.45383248559923595,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.7055,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.4265627843656265,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7289,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.4077987016103004,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.7298,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4991524391829501,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7616,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.4473279610179817,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.7199,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.433327670096491,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7401,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.5549931816783576,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.8405,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.44976384207067494,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7796,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.40329785746260033,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.6764,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.4734808460625881,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.678,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.4324182697579957,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.7169,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5221590387949478,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7415,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.5332044694962755,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.8014,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.571806435015935,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.8016,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.49967439182922213,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.7302,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.49118346108438604,
+      "learning_rate": 0.0001,
+      "loss": 0.6872,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.48913866368633124,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.7992,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.47013137899782054,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7257,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.4658911806163373,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.7299,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.39712521020306313,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6794,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.4180112431814468,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.677,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4284928495017929,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7733,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.41243122791327375,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.701,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3688516156765538,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6871,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.4313396253823104,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.7334,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.5298882466577094,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7525,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.4753678963133096,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.7625,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4197428604140736,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6601,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.525079564441233,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.7347,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4677503452316048,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7699,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.42874889487505047,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.7174,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4850354392599207,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.8163,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.5325336062346893,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.7202,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3720437406408467,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6578,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.4637695460351074,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.715,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.41919769770671306,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6684,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.47399139930057216,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.7167,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.4656387312722789,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7297,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.431769090576024,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.7581,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3803590215625145,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6779,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.5603234705750568,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.7024,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5154067125968048,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.9104,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.5549373309443812,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.815,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.453591654438188,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7581,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.4401941647094853,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.6741,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.40812457058737384,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6743,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.3504583347093562,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.6269,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4825232064555932,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7192,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.4462020124779728,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.6748,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.4621544035458387,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7384,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.4431544880545007,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.6995,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5180969404201087,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.796,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.44818368733319086,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.726,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.480921118362398,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7016,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.4537241838345266,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.6825,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.44379818024999296,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7028,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.4827124310633417,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.7388,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.45717227866934373,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.8172,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.5407420530627465,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.7344,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.5476220204120361,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.8205,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.41236078890642863,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.6615,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.429507843425321,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7424,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.45789088417995133,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.814,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.49840226614272215,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7585,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.4482599702245133,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.777,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.41224527407806694,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6788,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.41987569808059994,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.7615,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4323759431067474,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7643,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.3675059909087662,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.72,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.45712053432264943,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7873,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.41935204964647294,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.7268,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4739297222554467,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6672,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.44991199007186033,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.7452,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.42938766367993775,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.693,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.41665659561685137,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.6783,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.46341772855505137,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7273,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.4182288254216991,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.6811,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.5676463520496755,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6955,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.5505236747614208,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.7125,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.5167180992588334,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6924,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.4050515316722716,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.6586,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.3797701587048492,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6761,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.3820742783131323,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.7037,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4207268768764361,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7005,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.5872033549506375,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.751,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.41182080280736416,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.681,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.48090174763861654,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.7419,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5548055849716488,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7366,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.4361079974941745,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.6864,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.48496754655537594,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7747,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.4781167281093951,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.738,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4206306860043934,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7593,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.40300826612193685,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.6671,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.3756572888549628,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6853,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.3869805827574144,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.6424,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.43657568486927284,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7578,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.40679262808071176,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.7224,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.441240906989389,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7256,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.4522037045041075,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.6383,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.43698704397416593,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6941,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.4765084690325073,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.7139,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.6562954145610375,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.7757,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.4105045945825311,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.6781,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.5251939327477422,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6777,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.5695134795172152,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.7216,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.5135868556910134,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7754,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.47165858925231896,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.7808,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3880230579417375,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6446,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.5177400802415317,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.7938,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.5341808993716982,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6853,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.40063988029996117,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.6887,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.44864706816073324,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7598,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.42089206618338676,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.7536,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.45402519381629813,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7069,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.46182799064401525,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.6344,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4044480795325247,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6877,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.4452394241287927,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.724,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.42470769576354267,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6801,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.38070722851677474,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.6786,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4279682851412783,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7142,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.5153988835037497,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.8058,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5186003033423139,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7485,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.44215384668103297,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.6622,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3953883665933851,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6566,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.5268540652115712,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.7611,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4374834330320054,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7196,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.46517122973130165,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.7478,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3966377135474766,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6236,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.36283076324324215,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.6768,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.4703446177196834,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7447,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.4055321259051819,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.6761,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4617131807146065,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7615,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.40717874666449283,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.669,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.5200251141687845,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6868,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.46930700494470096,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.6526,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4217451501463285,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6825,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.4976416749467991,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.77,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4451828871570694,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6973,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.5609963646621109,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.6858,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.46504384233714136,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7657,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.4346667091896403,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.7039,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.516554360428688,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7589,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.571785898731525,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.7852,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3745411828670773,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.6745,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.5054649628331804,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.6905,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.4619358628489545,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6517,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.5036440082809436,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.7575,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.49748178875585825,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7668,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.4050494646861793,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.6809,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.41510237254019805,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7229,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.4679126409798029,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.7461,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.49923074616833046,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7556,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.4683006967237606,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.7257,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4369295941400887,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6254,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.4169258503447942,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.7278,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5018850814288661,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7542,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.4095436898801255,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.7353,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4094236842920161,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7252,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.48128697326604447,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.6489,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.44410206397414576,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6078,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.4367963196859822,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.6302,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.42876170705834227,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6557,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.46487569496018677,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.6632,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4117070353675247,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6992,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.4108913694228595,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.7661,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.5056805355792021,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.78,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.4783807645427945,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.6945,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3964619349449278,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6853,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.48729188871411383,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.7326,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4332830196550788,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.5924,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.4069626710751544,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.6983,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.438367791246098,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6913,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.4302631410104595,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.7465,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.5021691106301757,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7842,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.37936693978621344,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.7254,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.44276835352005717,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6512,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.4273416655809347,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.6946,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.4301337196717757,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7054,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.45556920191856143,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.6885,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4644653362345782,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.7561,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.4256074124149417,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.6806,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.3926899001419619,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6499,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.38092749541547033,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.7165,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4405630576123063,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6989,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.41373361036064654,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.6885,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3889753043133514,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6932,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.3779158448959292,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.6705,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4087540029329042,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7023,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.42587075011487324,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.6852,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.49807996258620635,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7229,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.373270181622042,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.632,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4750774107066468,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7389,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.41478198578627196,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.6885,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.41697709076204126,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7298,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.40997889032425583,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.6547,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.42595443046949505,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6919,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.4272331275871181,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.69,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4301365121290822,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7551,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.46430582102646983,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.7078,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.43954462128486094,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6947,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.410877966121775,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.7663,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.45636934444526156,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6881,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.4406791030344553,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.656,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.44908779405293825,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.731,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.43081103370266594,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.6481,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3790940002229224,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.66,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.7779848932614899,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.6787,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3686335127303108,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6806,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.4439345348202538,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.6912,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4424913014595794,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.773,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.45621744412818693,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.6821,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.394941762067283,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6682,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.5333301131620256,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.7751,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4110730024792835,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7006,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.42121987615334794,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.652,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4200436383052014,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6618,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.4739336833529893,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.7237,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.47305673327349285,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7095,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.4537154952100946,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.7467,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4843827307605648,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.645,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.46432031803046925,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.7235,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3876084772895051,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.7036,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.42753374737712524,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.7343,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3749563864309047,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6464,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.4659616812478141,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.7521,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.5857125777562097,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6817,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.46210346823419846,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.6828,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.42221537021220834,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6214,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.4457048252425589,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.7197,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.48528059872326096,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6839,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.45852419572461484,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.7411,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5617887402455709,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7202,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.4498211895755781,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.7038,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.42503251286810007,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6424,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.44170758653529296,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.62,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.35610057393796085,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6383,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.37425435234487286,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6754,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3936776282263319,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6601,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.46034838081495977,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.6943,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.4066225141906522,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.627,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.3858640367503925,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.6342,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.44831807101287013,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7039,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.46454349710235776,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.7448,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.43185515154717197,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6478,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.4039925961968265,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.6721,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4685884685660754,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7064,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.437495368402922,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.697,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.5013609622097205,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6348,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.3928807046293955,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.6506,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.41851771065401694,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.695,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.4148397310479281,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.6843,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.34632427812634764,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6448,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.4434044271178348,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.6966,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.4185228933267323,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6827,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.38459801766096763,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.6889,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3943276928660913,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6177,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.41176764896341617,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.6971,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.38025970545569326,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6455,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.4420415156862601,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.6057,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.43611147714187426,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6748,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.397038492096301,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.6918,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4664109049186901,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7114,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.39762390801481073,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.6474,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.58165867026539,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6996,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.37818800588359214,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.6581,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.41370612659127415,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6719,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.4234246150614126,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.6955,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.45271432227329883,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7278,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.3896901191977369,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.6036,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.5609736262184741,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7335,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.38077211332926136,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.6798,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4064709776626615,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6812,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.37490874266779545,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.6342,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.4330143977466438,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6877,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.39609531533999726,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.6796,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5746872198742553,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7169,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.6955975779348033,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.6939,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4583001053565282,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6649,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.335961766805353,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.6217,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4625730889324945,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7732,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.44874139521442424,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.7845,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4216141196611632,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6761,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.44065164143016794,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.6558,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4864648977419046,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7345,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.49567456143923777,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.6331,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.38520363824229475,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6846,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.4298873914826667,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.7205,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4182063839645925,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6546,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.7984264772972497,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.7568,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.45938886079604774,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6834,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.3602631663968547,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.6831,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.45573146538831666,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7273,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.4411017629549524,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.7174,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4115477835089943,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7038,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.4397253268529848,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.6749,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4153110037054647,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6694,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.45330892199582196,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.7337,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.45014042080415023,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7086,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.5453929600104398,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.6513,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3941390107280701,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6532,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.46690009399066584,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.6889,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4544476435460685,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6279,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.4666156504306541,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.7089,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4000486478664046,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6598,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.3959166116092314,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.6406,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.4999745227355347,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7318,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.5220824551660728,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.7625,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.39539992669258,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6679,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.5036688882847062,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.7887,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.48227512199081035,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7103,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.5949449071155793,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.7775,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.40727637141086087,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6921,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.4009110429283926,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.674,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.3697649649258211,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.64,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.4667293936982586,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.7639,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.39096891763990427,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6833,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.4295317564427617,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.6138,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.49473067071643123,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6537,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.43121447094036847,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.749,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3945702962951126,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6462,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.4256126539096834,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.6409,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4209330020909794,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7048,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.44465171515956664,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.7486,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.470146543029111,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6958,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.3430542192799981,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.6681,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4278662707906049,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6733,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.46376730813190903,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.7474,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4520046856226995,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6573,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.32907579332774634,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.5786,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.42564964837736396,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.668,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.3959031941752177,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6476,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4029997278045812,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6632,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.43738493230571895,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.6769,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.45933231703250693,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6613,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.39551741707098925,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.6997,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4034196705758546,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6626,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.4129745482055319,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.6623,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.5171749789515476,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6979,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.4285692421209914,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.6502,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4081031515560949,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6701,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.3674226067898384,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.6832,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3756956369593102,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6561,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.4487599902084762,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.6587,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.46317087209149344,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6616,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.46869974121680025,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.6819,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3929096221124496,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6349,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.38727743479523596,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.6367,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4463347660896269,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6847,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.3458659314793,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.6058,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4659795346966299,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6699,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.43772876372166897,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.6882,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.41809773512693893,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7028,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.4515957698474849,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.6731,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.5262501752393429,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7666,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.36199340044564093,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.716,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.44144751312494,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6346,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.4342055593224267,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.7005,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.3626795796653051,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6281,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.4388778436201537,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.6291,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4350075171081566,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7171,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.46335806131502627,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.7209,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.404156302898499,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6586,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.4560385946479281,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.7372,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3512054610727605,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.583,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.5163389460296613,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.7242,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.40836637784071556,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6489,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.38595969998255003,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.6705,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.44458769974886425,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7151,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.387572491963222,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.6634,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.39831307254179926,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6851,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.4326793722012506,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.6522,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3761518009550033,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6898,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.5318168930636895,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.7398,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4402774976471995,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6764,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.3636615923719273,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.5973,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5615009760015723,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7686,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.5488451248282121,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.6594,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.5068495893501076,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7588,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.6407632462410784,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.7611,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4900238542939908,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6692,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.4458753891615236,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.7154,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4268728281983383,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7403,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.4163307353758894,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.6751,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3790711693886177,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6714,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.43613734243006946,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.6329,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.416625334207955,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.582,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.3573621377255294,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.6391,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.44838677241013075,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6694,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.49598155654857595,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.7922,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.4195459170668918,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.665,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.40749648619205064,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.6148,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4928095362771939,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.741,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.42196252796864997,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.648,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.407490799028459,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6745,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.4430637115129798,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.6647,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.41572550766040145,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6626,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.48442852541343595,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.7474,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.39576276773544145,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6829,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.40957410219070856,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.6913,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.40815395918551955,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6332,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.45848707393438665,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.6935,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3955987497629303,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6329,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.48396955421661225,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.6775,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.37748761806878395,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6406,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.35216481069312683,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.6134,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.44214502121194277,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.644,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.4885588405453753,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.6568,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4203449899965025,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6871,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.3679142957176437,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.6178,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.4319693861040839,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.635,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.40801722426942594,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.6086,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4317790754967962,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.7299,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.39203068360957366,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.6272,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.42847622643373917,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7315,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.4515897736109138,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.6656,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.46418313495739943,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7922,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.4190693005291634,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.6779,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4748448416607454,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6192,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.4570517756847288,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.6838,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.43346042947082064,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.627,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.6083316409096777,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.7133,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.4149318177968615,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6812,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.40279033289445737,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.6491,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3661942288150413,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6409,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.3492518693723157,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.6449,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.38664075688037525,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6476,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.3732950185886913,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.6451,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.40586156896060066,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6676,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.39914721626542893,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.6688,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.5858423597476015,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.7109,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.39004996372302436,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.6711,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4422846916746057,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.7074,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.38360376191728496,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.6687,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.4103838694367495,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6366,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.4543607701441768,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.6662,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.481168479070166,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7172,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.3571374357856971,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.5997,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.4656901277210353,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6883,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.3883852086224729,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.6142,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.46282118180927284,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6742,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.44416827498862815,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.7332,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3764661067145374,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6429,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.462275565700437,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.6839,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.4730388536693979,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6604,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.4795687153560455,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.7347,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.4716673153974263,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.696,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.4664103686844258,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.7794,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.5018422631057337,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7322,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.42993250880327977,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.6518,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.44171724743541363,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.7116,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.4419988093536476,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.6499,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5430040198384588,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6606,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.4258664961000874,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.6453,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.4620311190948906,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7282,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.474533642643054,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.7126,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.4419312126447903,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6813,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.4726161631102984,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.6794,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3932056993668791,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6492,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.38928214123694843,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.6556,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4518171591491437,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7284,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.44001482188386976,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.6851,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.40605390396561136,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7082,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.47335094277664147,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.7335,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3744189862391875,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6864,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.45417328532589746,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.6185,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.43987961812149784,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7014,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.42984884893442815,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.716,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.47596598245542965,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.761,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.39859236660861885,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.6249,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.4176226059997992,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6545,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.4091784905788249,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.6413,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4479446913269882,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6985,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.4274965995923488,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.6279,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.4175199943420985,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.7025,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.3692704830319196,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.5538,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.41193975984275244,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6576,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.3879903506222446,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.5984,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4788245547001526,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.7042,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.3825999151613301,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.6431,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3874923256907664,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7392,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.49706772007250527,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.6588,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.446642706622744,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6563,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.43648055985384054,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.675,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4487207757830329,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6355,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.6904081084293971,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.7009,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.374706566591849,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6127,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.409270401776318,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.7169,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.5035641222209561,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.656,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.4707754134426772,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.7027,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.4249444770999359,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6529,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.3998860715897474,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.6776,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4674436263252115,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6552,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.44788320358106837,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.6159,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.41531233991063055,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6908,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.393963733771613,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.6512,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.44347314190488557,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6791,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.41861294210651295,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.7193,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.47729281110819066,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.643,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.3943339306536462,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.6673,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.44250450829581267,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.69,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.43638229451147903,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.6687,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.47806195481300895,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6697,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.37866928780170195,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.6488,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4270131231094788,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6531,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.4239074659792173,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.672,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.4010380547708696,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6814,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.38254446097538725,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.59,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3816544876495538,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6516,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.4149851969056768,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.6342,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4939913273078448,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7097,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.4473620301826761,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.6842,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.347898190178297,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6866,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.4677631604683346,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.7153,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.36857826653537523,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6308,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.4208952222294186,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.6234,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3615947693575601,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.613,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.3756762767045553,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.6585,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4327928230702486,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6826,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.4042377275237032,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.7428,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.431581157071277,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6422,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.3996661084377291,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.6932,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3986990368694086,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6977,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.4280089557109094,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.7035,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4437933253446788,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6996,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.4337974583319791,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.6574,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.5209740419143642,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6811,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.5128391656199703,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.6759,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5064398613672889,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.711,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.4040717907377643,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.6127,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.4900021425263775,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6503,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.40148166495479204,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.6751,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.478024833109711,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7441,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.3966442014026454,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.6398,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4346210876122322,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6345,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.4518751493966605,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.6722,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.45645819261345977,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.7766,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.3906524036530478,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.6913,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.43531277252955825,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7008,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.3861596842374385,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.6621,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3889786430539552,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6828,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.4686956061808216,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.7167,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4139587613272724,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.5721,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.3707594717031069,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.6226,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.41238141651385263,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7213,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.38596134444921354,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.6312,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.5294431848347445,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7428,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.3708889449347871,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.647,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.39920534073294506,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6604,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.39860181638799774,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.6281,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.46231369728897465,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.7061,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.45745721560795477,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6742,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4689754580880514,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.7039,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.4823618491509199,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.662,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.46336471271106144,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.673,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.35396268660446345,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.6535,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.5057640826513973,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6975,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.3795138987493162,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.6151,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.44355391727254384,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6406,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.5399971038799832,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.6719,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.42414602522240696,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6897,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.37502017005224747,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.6794,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4314696948175594,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6796,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.34774182154981,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.6331,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4168739496808594,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6785,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.3909911905539331,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.6534,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.5097445480440692,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.7623,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.4059124588275709,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.6667,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.43295360086793055,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.7039,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.3585355316944381,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.6358,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.38145025657316156,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6338,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.40944540455644174,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.7277,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4001505668384557,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6646,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.4297407107858297,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.7087,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.4628818699611473,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6693,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.46878217054932964,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.6508,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4662714739357374,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.7321,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.4341883993170443,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.7093,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.42279216696853417,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7288,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.33331493654272815,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.5991,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.43282013465082525,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.625,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.4787444650581861,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.6435,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.45048649980398436,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6765,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.4651771376062177,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.6223,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4224991553320907,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6566,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.3582193752613605,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.6721,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4734287505841682,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.7126,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.45700700134634636,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.6413,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.44247153043906856,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6816,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.47470973799676847,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.7297,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.5446957137639532,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.643,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.46334686798287195,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.7692,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.41091671416864994,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6502,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.39459758391103256,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.6413,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.4864800116717401,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6379,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.416298793772082,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.6878,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.36409187772858065,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6677,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.4101215847095293,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.6696,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.3847472437653507,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.5913,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.4247932929572247,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.5759,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3553625121999897,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.635,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.43164768345714893,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.6749,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.46418670622765706,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6552,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.46037860768689964,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.6209,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3954814721452663,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.669,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.4632088681723823,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.7539,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.3612381684901188,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7106,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.570964249199408,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.7016,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.43290226218463623,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6845,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.4336396133365297,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.6387,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.34928051538481464,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6331,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.43635113247538154,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.7015,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.44316229365990106,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6052,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.4277488879744065,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.6274,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4713402250138666,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7099,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.37742729038730555,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.6548,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.42519893779646084,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6707,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.39409367052304717,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.7144,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4200414963488034,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6496,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.4802378502631879,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.6364,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.4196396964710776,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.5843,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.48797004369630237,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.6952,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.38172998753419185,
+      "learning_rate": 0.0,
+      "loss": 0.6778,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1075826133532672.0,
+      "train_loss": 0.7417976837635041,
+      "train_runtime": 19115.8196,
+      "train_samples_per_second": 1.046,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1075826133532672.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1f292a7f1683eaf79bddef8cffe2dee5554e074
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cd7dd1d66ff4ab5dc97a6609c1dc54722fe489a9
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:085d39296c22c360e7c4d534d4f697116d34e17d698aae4ec57a6b035f902fbf
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c1a629de374a3741bbd76d893f1c4b600a0de224
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27bbfba65972d389b8ad7ee2efb761df350c0f7a182b5f4e76e444e472720d34
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..67a95898d230bff50174be6b7bb656fbfec7ca45
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.9795816727549139,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.3776,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.001400272852517,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.3593,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 1.386084578439465,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.4641,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8968642175430531,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2433,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.9172606992324467,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.3735,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 1.0104451804190506,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3595,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.7202998913128105,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.0944,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.909331910756895,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1729,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.7677918672527282,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.0781,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.320728027486979,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.1596,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.8528174600822885,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.0121,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7537754667752942,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.995,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.8658238321920337,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 1.0719,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 1.120365218132729,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9672,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.8933667962965879,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 0.9901,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6626033473553431,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9127,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.6598743400498236,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.9322,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5687357947833772,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8733,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.5745221794718155,
+      "learning_rate": 0.0001,
+      "loss": 0.9183,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5335547076637125,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8869,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.6527646967762165,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.9387,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6029701200334054,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9282,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5356669071254307,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.8965,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.4468237694215002,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8501,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.6820914041506152,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.8875,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5468818728749623,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.894,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.5108690786169384,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.9317,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5823547589849747,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9312,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.6348175797361912,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 1.0034,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5036760470288159,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8095,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.5086285965001636,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.8288,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.4810370375467901,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9338,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.5120024960432132,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.8819,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5024966363071183,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8253,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.5239072940748722,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.8691,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5230322392779483,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9452,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.48779597363311394,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8819,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.4752187441381419,
+      "learning_rate": 0.0002,
+      "loss": 0.8895,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.48115741632287223,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.8083,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5849041040140422,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8825,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.4725278601678821,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.9055,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5523659132085241,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8939,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.4638269541138185,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.7956,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.6416053734788665,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8854,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.46754119288552975,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.8075,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.49357320270679894,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8955,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.5265191782061447,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.873,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5440466788183455,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.9327,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.5517236902037315,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.8772,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.43798313657061366,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8085,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.4508153636398181,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.8308,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5119638374510451,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8612,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.5382792357634911,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8798,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.6076236517255771,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.9024,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.441952994803446,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.8043,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6106597165054768,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.9344,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.4406300276624783,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.7952,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.564217760542486,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.7998,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.48469011133740136,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.8903,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5288039702729751,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.9089,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.48006551682464205,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.8082,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5505340709331455,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.848,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.5503316668230102,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.9087,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4761815526904352,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8735,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.5228152494367051,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.8651,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.48727193746734115,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8213,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.4541746743520036,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.7564,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.606930240393257,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.9255,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.5068161831554241,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.8169,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.424067157048311,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7367,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.6252522883664977,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.8798,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5223020930728051,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8766,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.4941228814132986,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.834,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4873684591333978,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8248,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5963432196918743,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.9005,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.45748157914102094,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7965,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.45401161358757675,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.7329,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4519919461414791,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8374,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.5153754388703433,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.8594,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4650985354020782,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8447,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.5124898203889733,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.8575,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.3987512573205426,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.7722,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.5263385255738214,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.8442,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.42770353755918095,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7781,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.5136893553522667,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.8361,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5308907844287711,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8802,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.6081975357536167,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.8561,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5131090276495259,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8325,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.4584634943727301,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.8573,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.4458868662131762,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7505,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.4795068156261207,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.7912,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.454801285925989,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7622,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.5133998231490974,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.8776,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.5023671300581364,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8506,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.4747696577123322,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.8295,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.7440171252294648,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9612,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.47532276432413834,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.7502,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.5933290446396249,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.854,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.45899362568312485,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.8233,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5100520778137256,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8262,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.5061044066072292,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.8266,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4644796197561921,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8673,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.532889532692695,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.8944,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5402398258930571,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8318,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.4315294793014923,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.7183,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5059474728944265,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8116,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.7754971525338998,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.7798,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5170633989504952,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8156,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.578973362036397,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.9072,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.46987237459032416,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.832,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.5289940796495135,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.8089,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5130783014268392,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7312,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.5204585053542395,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.8148,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.49355582746057164,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8228,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.49244184486559767,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.7831,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4856545194485971,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7825,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.48638246715216615,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.7993,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.45328815654698607,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8438,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.4699347966122583,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.8007,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.954925514227352,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.993,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.4563360256347201,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.7498,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4660893825770606,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8683,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.5353770623306547,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.912,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4549228446911368,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7854,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.503929509008853,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.8639,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.5357250808655094,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8876,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.4660867449746972,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.8103,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.45966669204874755,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7822,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.42467515154790736,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.7763,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5278863210130068,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8289,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.6213394450284235,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.7384,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5198214516650237,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7867,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.4815304018541252,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.7645,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5445023318738487,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.9136,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.44610566324522344,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.781,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.6020596210092647,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.9109,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.4769291715566396,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.8249,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5547657722340099,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.7991,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.5051002370148648,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.8698,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4916122523523175,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8153,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.5166782007595769,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.7959,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.44309227763417436,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7681,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.46787767001269476,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.8047,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5048137999449103,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7741,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.4115538527551283,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.6979,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.47580725304964017,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8439,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.4284484275975892,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.7631,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4575503969957535,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7662,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.4787216610881153,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.8322,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.472925175131021,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7471,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.45698351899719436,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.7955,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.5035216462112196,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7837,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.47296199423739266,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.833,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4647638019652673,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7565,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.4831036228689567,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.7574,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.44156563283458045,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7636,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.44245203192588406,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.7123,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.582444811054162,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7096,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.39873187756149414,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.7913,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.41708637739457183,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7844,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.4446437065954701,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.7571,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4619420767214924,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8421,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.4581600049161185,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.8037,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4795136121564061,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8403,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.4428534252300689,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.7776,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.5381828922235592,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8372,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.44833639039360296,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.8648,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4282405270094912,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7795,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.4774713545866125,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.8042,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.410935491785557,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7735,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.4759143841412201,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.774,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5206608221789917,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7397,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.5960796914978171,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.7975,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.4871539656184539,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.7976,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.44498736464472977,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.7699,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4150771843721881,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.78,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.44553083248616643,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.7706,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4224036590709276,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7666,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.42371940924346846,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.7901,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.591708075847747,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8886,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.44833292498327687,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.7761,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.47086220959100905,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8488,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.4262782763041816,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.776,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5214668899140502,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8112,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.5001587396984958,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.8792,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.4979113579842723,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8363,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.4497772642309642,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.788,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.45321282554780185,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8022,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.4231460933527123,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.7393,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4640456987297124,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8033,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.5859044824831487,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.8414,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.43572744793730855,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7739,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.5226529387531288,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.7577,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.4497304299400067,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7549,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.48876913351196766,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.7676,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.45111221462046336,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7981,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.4585065393312021,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.792,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4674904268057127,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8676,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.47586101388093477,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.7624,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.47274832683360907,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8281,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.4276018360251776,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.711,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4590346746005837,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8043,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.4816141964583813,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.734,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5692250295310609,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8311,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.5168964734560001,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.8003,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.4434269203102765,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7309,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.556711306277704,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.6997,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4805482120156597,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7914,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.43647431344551174,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.7815,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5744012343987621,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8203,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.5684780483215756,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.8015,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4303508524158662,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7652,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.4896619399112934,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.8379,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.5320091111275196,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8287,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.45903465195702364,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.74,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5694127309219911,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7837,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.4689176702326866,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.807,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4697709903272958,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7965,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.4370312389959886,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.7761,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4765623072933397,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.6862,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.6342609583307439,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.8148,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4799134250255987,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7785,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.48212206697204846,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.7898,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4655827018767983,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8062,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5265800855384186,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.8375,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5774244433936847,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8595,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.522574221020107,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.7785,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4658974495563153,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7776,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.48850391555818595,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.7231,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.4900387100247314,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7539,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.4470268992096281,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.8205,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5372584408183166,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8523,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.5786736632864303,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.8516,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.4150327411851708,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.767,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.4744247607116235,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.8131,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5212775801914824,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7665,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.5315130773715251,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.8862,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.5811528871742159,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8707,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.44617622461432815,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.7049,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5110629204553769,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7552,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.45105577641636846,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.7799,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4988548737364565,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8198,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.44820254586860836,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.7746,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5064379659275585,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8171,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.6307811170822041,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.9037,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5709592111841547,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8456,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.4827907950086345,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.7598,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.48715082409785165,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8504,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.5022225482599009,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.8216,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4461203089196808,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8183,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.48705056288283083,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.8283,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.49774987618821515,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8191,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.5339328705007534,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.8081,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.4791336870020177,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7499,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.45800363192527943,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.7997,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.46519629604160584,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.8501,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.4715616737015773,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.7897,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4245101255916471,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7595,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.5362847783633511,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.846,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4482413130364821,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7155,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.45952161523507334,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.77,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5546853267775046,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8748,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.5427836514907146,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.8554,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5170096339247156,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8374,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.5048070790052696,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.7852,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.42519373761514895,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7553,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.45921813676682904,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.7488,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4383175333338921,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7668,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.5232721465390389,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.822,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4569931890966662,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7834,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.40542299303164925,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.8149,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.44084215043019426,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.785,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.38904576075065306,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.7591,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.6117423420347984,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8003,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.6406690655995851,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.8713,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.49705391186855263,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7654,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.4477634962335582,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.7992,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.41943609720436703,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7339,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.5026472739157812,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.835,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.39988503480991233,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7375,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.5975134595609247,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.9192,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.49964857187934897,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7672,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.507087097547393,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.8104,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.49188156989526316,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7378,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.5668204474599812,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.8318,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.49264377554081845,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7835,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.4073619173169081,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.7141,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.49355344807027934,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7787,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.4663434613832926,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.8168,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5299134075969804,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7807,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.5537745502297323,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.7876,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.5671415781776914,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.8057,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.47147141641940005,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.7887,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5135239239131087,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8399,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.4567751279486991,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.816,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.530766676935918,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.8247,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.4392004139140252,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.7879,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4613367032132493,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7787,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.474045585760139,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.7962,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5066219495984166,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7755,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.48675505607185726,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.7671,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.47948629724067315,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7716,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.5599768756820828,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.7916,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.49403730031729814,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7698,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.5301548619115438,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.7496,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.45832223663782296,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8301,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.47129035121504553,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.7459,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3886867598075655,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7442,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.48484427387495405,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.816,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.47656839689614183,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7921,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.48291875194295386,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.7739,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.43066624272424553,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8113,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.4921583706771416,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.7332,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.4635208516179352,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8202,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.42605236208308095,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.7638,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.5928432540931313,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.8448,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.42223495580594994,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.8594,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.36180009658690626,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.718,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.5055409710562159,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.8116,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5258821591435593,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.8743,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.42039164828489956,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.7835,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5398918499239291,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.8457,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.517493056428269,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.801,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4592443902096945,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7163,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.4570421656040498,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.7891,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4653670519039479,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7404,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.4700823361619639,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.7739,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.46741465910878005,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7353,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.5102078699593384,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.843,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.5575580445388119,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.8205,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.5053123994869361,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.7919,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.47986139425074736,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.8036,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.4106720871631404,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.7301,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.45195261587163915,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7721,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.4513538733673665,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.8287,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4916534731703709,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7991,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.3944159257016331,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.6861,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 1.0015835190531766,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8172,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.40573909671129543,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.7758,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4754532932792592,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.8139,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.4323421835044929,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.783,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4247367763012067,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7058,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.4444386432236298,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.7733,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.48628229506613657,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8221,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.49253670986195996,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.8053,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4339843395502114,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7737,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.4833521631305639,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.7774,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5205237546210102,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8133,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.3987540237364512,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.692,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.42948195691635055,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7249,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.6465198840665377,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.8311,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.47249829805105875,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7841,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.4261879264130769,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.7186,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4733907759093097,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7866,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.4146452149695671,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.7119,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.456970736717012,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7613,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.4861683419656093,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.777,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.46398036139960575,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7371,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.3741787436059476,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.691,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.48059657128466404,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8145,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.4494500231240095,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.7534,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.5289891737414257,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7592,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.5238508441860991,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.7654,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5006747022563904,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7694,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.5261095172263893,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.7961,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.4465164232516254,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7719,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.5509450648508075,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.8502,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4966173074903064,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7722,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.4979829819052925,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.8141,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4070899764023006,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7126,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.5277470259602063,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.7905,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.55268793628899,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7444,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.4661807766951434,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.7516,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.43528338047912696,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7993,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5169034933901338,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.7944,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4659723705564739,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7806,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.4621920855334705,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.7643,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.5033112857376137,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7568,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.45914237617703174,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.7571,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4811632118631198,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7865,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.47428555365500835,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.7615,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.41642206878293037,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7635,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.46410825848106113,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7795,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4749681805721738,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.8257,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.4230544131011867,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7637,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.3919205953867002,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.729,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.4645646620710507,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.7632,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5226866117554707,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7977,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.4532133815875697,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.819,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.4800058122866018,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.742,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.40963900125414177,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.7623,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4345153368300366,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7405,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.3888545931609104,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.7214,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.3726771135751388,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6504,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.4961883258832556,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.784,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4808155942240947,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7843,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.4360198562884242,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.7447,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.4290451599095809,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7496,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.49075497550809033,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.7901,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.433975566052962,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.8071,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.43455806077802206,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.7979,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.39654183107816543,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7455,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.4027747900164966,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.7612,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5215110040861919,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.8311,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.458476625944734,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.6957,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.5032765612976655,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7318,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.4282328303611774,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.7315,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4283810020409699,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8376,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.3956368192508352,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.6642,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.49128944752081743,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7537,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.4001949751491434,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.736,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4477164246360966,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7402,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.4031274937697729,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.6603,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.43057763802374044,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.8079,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.43719760308136896,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.6774,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3676676205070124,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.6877,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.458911809801583,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.8522,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5571317205735612,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.8047,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.48626898520583006,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.8081,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5310547052797169,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8538,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.45137021273237177,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.7976,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.41415089507065106,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7359,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.37909456905958816,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.6809,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.48780051331278546,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7655,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.4580221180681622,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.7339,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4132785494038251,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7837,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.3944766874177697,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.7302,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4037477141523566,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7347,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.44285585008417,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.7709,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.46780597074089453,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.6999,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.43483794888344374,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.7685,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4245764749893502,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7185,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.41944137971055734,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.7137,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4152331799504103,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7186,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.3838060086477032,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.686,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.5027524972661573,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7165,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.451858046202718,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.7389,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.44750570537460616,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8162,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.4193427784451634,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.7083,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3985119493318944,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7839,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.6216787661550495,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.8906,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.4942803545738827,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8168,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.42018726726070965,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.6902,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.43343226435576415,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7445,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.42222783049209456,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.7477,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4997280795026545,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.8327,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.4509750043267108,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.7193,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5435894942324518,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.8441,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.5690852374518198,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.8191,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5901781835149241,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8416,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.44095547117052597,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.6817,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.410082351206514,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.6844,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.433390304861033,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.7151,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4673148852916645,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7742,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.5629792615802157,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.6944,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4197799181102277,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7866,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.42309809359489753,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.6679,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.5107005715491209,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7455,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.4033634928151588,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.7024,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4384657706250393,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.739,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.4504037806901727,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.7991,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.48886688910936515,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7435,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.41267602272551973,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7444,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4115970936792021,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.696,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.38804305128365696,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.7202,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.45562899264909823,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7737,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.47743243253657847,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.758,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4395717414306346,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.6759,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.5150009930824007,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.8263,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.44237543082679537,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7734,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.4931934397534956,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.7782,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.45668570071819947,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7347,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.44567464720174327,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.764,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.40635707404287097,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7772,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.4730844272067816,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.7864,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.42541835491027025,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7079,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.5089978868540131,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.7802,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.5211685215608624,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.8409,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.47327225076322404,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.8571,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.40936558132508094,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7973,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.4362419996174767,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.7758,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.5238046230383762,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7709,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.5497251848678121,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.7611,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.45884982502845467,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7774,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.4012956728140862,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.7038,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.46963958742490086,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7212,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.414724977340573,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.7483,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.45545795630258695,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.8236,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.4080879001104178,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.6612,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.5728231130149111,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.8943,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.4048483515111806,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.7335,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3714512698859433,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7355,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.4632496150613504,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.8119,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.44023327521008365,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.791,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.4715924508488183,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.7535,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.44552213416424963,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7662,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.4097714152317744,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.6585,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.41866497088119065,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.6993,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.4188938513957478,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.6517,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.414161560358882,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.701,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.38747983953923315,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.6629,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.42397825009703494,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7227,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.5074637517864383,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.7306,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.41451661042263027,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7548,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.5207857570485644,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.8035,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.48617295332934835,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8309,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.3849126756616638,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.6318,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.44966786994539865,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7125,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.4998498402001065,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.7635,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.40824096736907733,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.8029,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.42493678327155465,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.7078,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4643039147700563,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.763,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.451550065523837,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7453,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3962407481953412,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6989,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.4240972513922981,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.6977,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.41336338106308385,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.687,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.37027685712700015,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.6883,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.40170706197125805,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7568,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.4554872123840497,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.7447,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.40574395432970756,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7286,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.48608042117887207,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.7384,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.45949040952795284,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7059,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.42454813479036535,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.7408,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.42970061008444677,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7353,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.415338461458164,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.7682,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.40326020822765896,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7332,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.4696858589276105,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.8016,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3866997010269802,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7031,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.4302858454821888,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.7477,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5291243986938438,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7281,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.414861003656148,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.6288,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3745285570911487,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.6187,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.4433836524373664,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.7558,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.41415406260115645,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6811,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.5928849942784749,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.8649,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4460533812963949,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7696,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.4322610863341201,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.7364,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.43468893082608867,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7042,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.4279277608921427,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.7676,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4740815355097255,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7874,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.4662686508421357,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.7279,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.45311482424332367,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7358,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.5402236483780922,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.7455,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.37960441149392116,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6726,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.6249639514914175,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.8621,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.5331954264682194,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.8619,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.43736806035408327,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.7501,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.39133146091624577,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6914,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.5933312796253537,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.9193,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3847507085275376,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7759,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.363405009433934,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.7039,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.42582514682408823,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7756,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.4090049364624364,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.8281,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.3738580177207341,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7165,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.4863876051491984,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.8025,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.45352490918087,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7511,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.42160383400991047,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.7283,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4083416424869398,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7113,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.4499074190268108,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.7747,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5312004411706165,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7676,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.4181180073120208,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.7128,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3742699657921233,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7277,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.41547253781863375,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.7226,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4125901670375791,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7637,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.3940323995879189,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.7455,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.44271352119465857,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7134,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.420556415407943,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.6852,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.5086736657220369,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7157,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.5047448006462422,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.7405,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.47623346942205214,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6759,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.435263941647073,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.7728,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4351854503352841,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6759,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.4412934491702434,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.7063,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.41445704404686395,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7044,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.45066220450604044,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.7094,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4233844751789096,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6544,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.5000831029823927,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.776,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.418103957155811,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7505,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.3835911317292797,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.7067,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.38346700490955143,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7269,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.4428443387426583,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.7108,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.41075676169897724,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7651,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.5108612361056679,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.703,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3939003031118122,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7866,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.44288146767279274,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.8414,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.6436151513443139,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.727,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.5314724432682242,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.6771,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.41804944318320336,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.723,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.43385673641757744,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.6596,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.43972284723540145,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7128,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.4130285489578436,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.7377,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.39400294155797677,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7172,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.3556266444230241,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.6508,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3922797153545315,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.733,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.42557567070328217,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.6811,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.47518565761801196,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6853,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.5243789972413347,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.7605,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.596499320293954,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.845,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.5678827706833086,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.7664,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.46971601029972,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7971,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.448639421281421,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.7541,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.46375060985537697,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7397,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.48939844130058974,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.7442,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3873308221390546,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6937,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.5384616961392372,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.7938,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.47470324845901535,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.8125,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.42177784078337344,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.744,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4934661926935679,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7409,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.4613479157092345,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.779,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5452939935438601,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.8148,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.4508393845728654,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.7924,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4565799747480313,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.737,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.519353136825866,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.8351,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4497705570041638,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7249,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.3677815627301932,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.6891,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4838852628668298,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7266,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.5254859830223015,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.7767,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3919541556072212,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7107,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.42827431072546535,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.7075,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.46837510300320306,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7855,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.5506398108077281,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.8101,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.44938366772991534,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7572,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.4602391712132883,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.7316,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.46875722931233516,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7303,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.4697615865717118,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.7817,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.41354754102731717,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7188,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.3805016393702369,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.6743,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5054227844560674,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7994,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.46229371886933873,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.7079,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.4376660706476633,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7244,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.4004980165850154,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.7255,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.5464876140864391,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7595,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.45246774575417864,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.7115,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.42679202516536824,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7379,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.5662659497458966,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.8379,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.45926997631132005,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7837,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.41818847338701415,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.6759,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.46659325022724846,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6766,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.43217241869721623,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.7169,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5306198437580659,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.739,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.5882053404532518,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.7938,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.5712201539772703,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.8035,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.5543194754396824,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.7287,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.6361289588756656,
+      "learning_rate": 0.0001,
+      "loss": 0.6836,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.5003807411642396,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.8057,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.4805444307900423,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7293,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.4459141956754468,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.7287,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4061230096741999,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6775,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.41600117805833925,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.6779,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.44343437229655974,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7739,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.42608066265593547,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.7031,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3714372008628129,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6887,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.43043174543791074,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.7342,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.5226011484213909,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7462,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.4662867219259712,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.7653,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4187437731003551,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.663,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.5355323628621518,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.7373,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4737265018487043,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7729,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.4865560065340418,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.7206,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.5026776434756831,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.8127,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.5283868792090227,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.7204,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3719591365008973,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6594,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.4619509085697814,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.7127,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.40751957206512246,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6669,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.5286405314124261,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.718,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.468664846462301,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7331,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.45131184728020785,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.7606,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3829838559164893,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6796,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.5079153239489907,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.7025,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5085941273721384,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.9081,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.5449010666211154,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.8133,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.43887401346898125,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.761,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.45721165998038454,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.6743,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.3955203395625884,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6766,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.38969950727053226,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.6313,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.47811786297910214,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7201,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.4412149235369839,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.6747,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.42196332818323135,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.742,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.4458398334329028,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.6956,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5318092860659093,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7957,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.4930020597088423,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.7279,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.5017236219515888,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7046,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.4630320439388713,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.684,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4227403678850825,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7033,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.4829551597107614,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.7424,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.46945474281683686,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.8187,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.5496894171200998,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.7374,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4663964552750982,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.8222,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.4019324549889185,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.6636,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4227083940417562,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7451,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.464047932365196,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.812,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.49827929688770517,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7623,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.4494487600877898,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.7798,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.39968387273888206,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6776,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.4262359246978625,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.7656,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4307000807129045,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7657,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.36881199515602503,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.7187,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.44543571007999266,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7846,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.41884765278422886,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.7264,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4501268207049666,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6708,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.44298997651470345,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.7435,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.4490464030296316,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6915,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.4096812132035904,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.68,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.44334191450228255,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7234,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.4065765469269847,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.6803,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.6014789132852144,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6985,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.5409184779201434,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.7104,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4708620813419675,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6942,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.3991163373887498,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.6586,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.37935939327845475,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6761,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.399493088326328,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.7024,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4132077291438483,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6957,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.8187355975815758,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.7509,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.40872121952512946,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6791,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.4572049418635426,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.7393,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5404394783118169,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7402,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.4291348543194726,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.6862,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.48511918081626154,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7775,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.5010967847627927,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.7399,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4238614997566764,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7594,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.4129108067302885,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.6705,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.3801850715707057,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6847,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.3850503416440505,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.6425,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4322390793478073,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7555,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.40808245632812906,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.7237,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.43661810536641515,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7242,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.4511391167280754,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.6379,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.44596464124334795,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6934,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.4407154181297479,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.7147,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.45653935500776016,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.7739,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.40727134095533485,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.6808,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.43313671686290234,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6762,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.5747822773258235,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.7176,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.49156474674882356,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7738,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.4825352538155201,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.7841,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3891821855316537,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6461,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.5184218795544006,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.7965,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.4545607963945075,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6857,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.399987496419545,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.6916,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.43975174551870333,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7624,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.43333418303998245,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.7543,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.4047273320690393,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7068,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.4416971341780052,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.6352,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4464883470061259,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6834,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.4425251211091578,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.7237,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.4339067833924351,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6823,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.37628247079657295,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.678,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.504222130084519,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7172,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.4923074669303458,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.8012,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.552459322864626,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7479,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.45851662219750045,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.663,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.39770811850719884,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.66,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.5387005644882501,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.76,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4120007240669047,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7223,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.4529312181035841,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.7491,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3880839627189889,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6217,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.3613010136847369,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.6757,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.47210927255794144,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7446,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.3725482277294428,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.6791,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4681099311515542,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7608,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.37805816611073806,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.6693,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.49364379425671195,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.687,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.4757654358296188,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.654,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.41975469935854065,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6817,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.4948237468744432,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.7695,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.43692541122160833,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6984,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.48637179718674667,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.6883,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4595795984986259,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7694,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.4577933808320737,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.7038,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.476120584644686,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7576,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.5645954033128227,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.7832,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.38318261143506066,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.6781,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.46016469252035364,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.6898,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.4766016729359985,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6492,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.493164015223603,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.7555,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.48558809975651357,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7621,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.41546306411218065,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.684,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.41016241103210943,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7196,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.47752129112664465,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.7473,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4856038759001054,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7563,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.5021371546458332,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.7234,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.40910017528170856,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6213,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.4263522002359121,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.7287,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5031930708975815,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7488,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.4145521757788483,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.7347,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.41344524756191847,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7244,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.49117563386202884,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.6522,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4459330410255157,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6059,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.44548999592648314,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.6308,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4486957922043369,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6552,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.4025693300248295,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.6665,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4006004171665509,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6983,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.4082360091469701,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.7635,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.509426336326775,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7777,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.5219702631152514,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.7026,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.409433339007301,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6858,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.5111445742851393,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.7345,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4278440622908864,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.5927,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.40575034713415187,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.702,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.47102491851325967,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6909,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.43463529412558344,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.7479,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.514778899493937,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7814,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.3801411112782445,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.7259,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4355576096171972,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6514,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.4325118593920335,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.6931,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.4258437731663501,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7035,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.4291018923022965,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.6857,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4819472695768693,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.7583,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.5463902606663359,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.6832,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.3861307644596196,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6511,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.41580726874585294,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.7147,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.44964197208891377,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7005,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.4021668915311515,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.6892,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.39523796525402727,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6929,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.38565520201311354,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.6682,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3995348928161467,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6982,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.4703906816473926,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.6853,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.5033547096332919,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7224,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.38181669979362143,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.6383,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.48820392994934503,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7348,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.3970352873792021,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.6915,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4282462226980526,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7262,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.4131366154190306,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.6542,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.426275032913804,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6986,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.4385390832068032,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.6932,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4337495818622833,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7575,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.466350734858001,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.7019,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4368116396485175,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6942,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.4126598144026732,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.7673,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.6288184741220477,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6909,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.45048610759488894,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.655,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4594592620204484,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7328,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.4338626551219975,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.6485,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.37899094255348686,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6566,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.437215121696517,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.6853,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3880565714552592,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6799,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.45764446660770025,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.6892,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4483748667859573,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.772,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.42973735024858706,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.6798,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.40383156607789245,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6659,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.8482268686810346,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.7819,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.41571178141097415,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7018,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.44792133357155567,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.6571,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.42746194302730683,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6657,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.5100274591863196,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.725,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.4676549283394495,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7094,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.4531270516170705,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.7445,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4702392246252879,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6441,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.46607399968829655,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.7198,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.8329031061656196,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.7032,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.44674766634301843,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.7361,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3910474905924863,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6427,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.48318530717608854,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.7561,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.5740067638768978,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6773,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.46306313597699666,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.682,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4921344276105706,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6215,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.4480719423734006,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.7183,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.4333055760407282,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6872,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.4503796959024257,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.7445,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5498040941736474,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7201,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.4647843613174905,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.7024,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.45936528866579146,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6426,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.6008042177834249,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.6195,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.40339065827934534,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.642,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.3889583369421051,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6752,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.42592145162890077,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6587,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.4644492058526666,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.6938,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3719718621862749,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6299,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.4915898843370229,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.6359,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.44205598693120685,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7051,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.4768946429172093,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.7462,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4654488939460009,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6449,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.4081539910008458,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.6738,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4803114921037021,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7085,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.43690457017312406,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.6948,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.5079444611869034,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6334,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.37947204367856424,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.6523,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.40617936016923767,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6936,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.41296487666566845,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.6852,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3539471291866326,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6446,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.4560156604490935,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.7012,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.41704253075285647,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6852,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.38883821624010945,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.6869,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.39386521161668075,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6144,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.4376174108382858,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.6986,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4065493351088732,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6454,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.3941707213877182,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.6007,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4326973562326279,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6757,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.40634280380940946,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.6898,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.46060240732593877,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7121,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.39107008531256765,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.6439,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.5568511298175615,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6982,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.38029120588041837,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.6581,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.43775333314753384,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6743,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.4223830486419374,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.6952,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4315052253643534,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.732,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.39178277582854004,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.6027,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.5627104260185086,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7304,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.39350483281651716,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.6866,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4095154219580877,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6831,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.37354388758571105,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.6328,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3936880749526102,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6883,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.41527899802744356,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.6792,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.7595329157942547,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7183,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.4975754419510984,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.6937,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4521967913155814,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6655,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.33020318016896827,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.6167,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.5048026377794702,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7685,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.4491681253602928,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.7819,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.485749096981155,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.674,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.4095808744339379,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.6542,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.47721915371702767,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7347,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.47655723514124676,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.6334,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.4032498972457474,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6865,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.4380646892148774,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.7206,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.41233642711261126,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.653,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.9220102122310604,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.7581,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4835730322570472,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6839,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.3673687066773582,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.682,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.44778735795246244,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7226,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.43417465922385345,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.721,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.5059754441089996,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7042,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.444759680951798,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.6797,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.41036003482243283,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.671,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.4546342656898402,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.7371,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.4698253064161849,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7089,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.5642074583381484,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.6529,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.4030917615643339,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6512,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.4641126026083197,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.6812,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4730074993629064,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6292,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.4653932203462421,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.7085,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4017944821803112,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6597,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.38975962223847943,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.6442,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.5709031936190804,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.733,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.5317919974022999,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.7605,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.40589358111230983,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6684,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.5503734655213102,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.7868,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.4916943514014524,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7103,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.6020255347762185,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.7783,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.41380248929043034,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6904,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.38840340966320364,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.6757,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.36884808026100746,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6359,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.4638521732939306,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.7644,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4031733673863279,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6829,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.4455684228946421,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.6175,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.44512262470760516,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6551,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.4392274040280466,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.7532,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3846213730566546,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6462,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.3908739955341278,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.6389,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4210490998284046,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7032,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.5121144943279137,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.7509,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.4276742573054915,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6979,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.35333439799515204,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.6707,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4204807321417563,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6767,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.48716658706423366,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.7501,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.38039866799337263,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6545,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.32672065605148515,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.5797,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.4076002889185543,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6728,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.38753220711205655,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6454,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.40540466182809887,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6637,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.4454018587849093,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.6776,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.5611693656661421,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6607,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.3825257735895134,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.7003,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.40734098381589984,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6609,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.42583721416048925,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.6623,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.48805096685571947,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6934,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.42500877249350866,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.6513,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.40493089101612706,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.674,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.37284688840143426,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.683,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3850534379139986,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.656,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.45394202888183943,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.661,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4689936892545102,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6613,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.4694811646835681,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.6819,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3980590952745821,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6348,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.3875275031787178,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.6382,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.47513560666438664,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6895,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.3563948006388464,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.6074,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.43436570592978274,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6693,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.42645668786967905,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.6854,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.42641985111716113,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7029,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.43453452583242996,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.6757,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.5943868344640286,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7683,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.35667236842417455,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.7153,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4169268668868495,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6382,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.45900056490702645,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.7021,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.3743573851538462,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6293,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.431799415704186,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.6307,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.443839386021764,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7189,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.457217602118605,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.7166,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4298899530051766,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.659,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.44483076435674174,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.7343,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.35211899297553695,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.5832,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.7731949739466223,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.7262,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.41017806254830447,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6515,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.37979830419359745,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.6707,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.44717119588271975,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7163,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.4079752030933211,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.665,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.46165802291394364,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6828,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.4177470942809108,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.653,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3817091854290759,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6909,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.5418869050582212,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.7432,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.44523306273336016,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.676,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.3619808539027808,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.5962,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5842481216946012,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7678,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.5461910209473555,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.6585,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.5270273146574822,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7617,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.5765724533332864,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.7621,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4174238770840169,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6699,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.5840180362864116,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.7183,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.43782142253088613,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7388,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.4143539040070012,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.6748,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.38610298875132026,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6717,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.4529466347927891,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.6332,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.4104044812469816,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.5814,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.3513567415760835,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.6406,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4581821739165075,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6671,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.49925155395470433,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.793,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.4938698306908264,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6664,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.4071602120149282,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.6133,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.470231428927306,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7382,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.41085096374471225,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.647,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.41451453515814146,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6763,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.4317718419901924,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.6626,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3739245136514371,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6631,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.49147832404649994,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.7476,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.39322438754960165,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6806,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.41150429736076116,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.6922,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.39706711639249687,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6346,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.49795204929118797,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.6942,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.38100589182329975,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6335,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.41974531064997533,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.6788,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3750730946030414,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6395,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.36268379646764537,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.6142,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.423716553799071,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6448,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.4942107069468341,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.6627,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.42456064348420397,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6894,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.36518083687152986,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.6189,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.4404730960139239,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6369,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.392567768076952,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.6081,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4290000072468106,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.7304,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.4199022292164449,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.6237,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.4397419231813673,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7319,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.40755746261021175,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.6677,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.46991411782334597,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7943,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.4426440689555142,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.6747,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4348959755812302,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6185,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.433817258179641,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.6844,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.41814622171273114,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6268,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.47997402629270003,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.7138,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.4347748528844761,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6831,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.4010485407026081,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.6475,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.374961629875271,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6404,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.3457641040319451,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.6436,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.3969026935113594,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6486,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.36695230119895345,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.6426,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.39454987021060967,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6672,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.38197236694832964,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.6689,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.5920091225654076,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.7146,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.3814735307252148,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.6746,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4533789513726459,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.7033,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.38583837856569086,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.6677,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.454085290082888,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6362,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.4742972977466862,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.667,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4824677083631548,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7184,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.3578146858746713,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.5969,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.457944247105592,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6866,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.38401135991776364,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.6125,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.44540543819947204,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6726,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.437737845257865,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.7318,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3709558725890715,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6437,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.44814860250802013,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.6883,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.46865254426251146,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6647,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.5167131246294455,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.7374,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.3890793000392378,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6978,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.46258858660726937,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.7821,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4845841428225895,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7329,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.44073517080169233,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.6542,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4424170136027106,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.7131,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.4289198341530067,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.6492,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.48072495328863873,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6587,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.4320187801746069,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.6444,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.4561240308758049,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7246,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.6087580846063493,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.7112,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.4530540900311635,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6826,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.473302927557411,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.6788,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.39430156773094005,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6492,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.38741073903837653,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.6516,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4992858738222001,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.7291,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.4448439619855907,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.681,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4043429402789075,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7083,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.4872191603346239,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.7314,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.37327936693140723,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6852,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.4573379195008329,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.623,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.45019933175472543,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7033,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.4188359341478893,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.717,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4867025160886115,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.7638,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.3828268279750639,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.6223,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.44475127573228324,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6524,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.38603117627268635,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.6467,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4514119122712399,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.7004,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.42230542927567644,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.6272,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.4091204371094442,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.7014,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.35972169908969026,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.5502,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4222850189851346,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6569,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.3910537015498819,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.5971,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.47414596155603655,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.7066,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.3863146078649115,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.6429,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.387325530291395,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7361,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.5225291485970791,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.6583,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4506614605781981,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6544,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.4562971013829592,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.6772,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4478968361283968,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.634,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.625845777935862,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.7035,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.37214414526022493,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6108,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.4092861210799874,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.7133,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.40098242819927743,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6562,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.4600776408862824,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.7029,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5058924463716198,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6528,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.39735021532152043,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.6804,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4445361120546298,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.658,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.46753041919202154,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.6155,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.41436384791158104,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6891,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.4077205837912534,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.6523,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.466454862461288,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.678,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.409423738849171,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.7214,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4748124598050134,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6449,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.3871256574050166,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.67,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.46831718930627114,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6883,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.37504315446264486,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.669,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.47688428959244344,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.665,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.3619539400651978,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.6505,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4594127274377102,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.657,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.4280258838906803,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.6714,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.4034978688949853,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6796,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.399767578959943,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.5946,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.36537304243469054,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6483,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.4429827195089348,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.6358,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.503755620863335,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7091,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.44931627730300316,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.6844,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.355137014163896,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6883,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.4751342717070414,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.7175,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.3934305241762361,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6305,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.4284674206520445,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.6263,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.5304078370298239,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6128,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.3838543952700125,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.6596,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.47214302246291556,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6835,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.4193805545544715,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.7442,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.46391645022575856,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6458,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.40499092839719736,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.6886,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3987988642580818,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.698,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.427183617651861,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.7051,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4443270212370436,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7027,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.4471706882159882,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.6577,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.48972079173069394,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.682,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.5065267660417246,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.6782,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5069107515729039,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7146,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.40254688697544516,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.6141,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.47425515127829737,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6517,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.3999312335834582,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.6731,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.50007730288023,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7449,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.390957345630411,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.6375,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.435374353883707,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6349,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.446772375933306,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.6734,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.46141118177489704,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.7738,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.40101262069842275,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.6909,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.4401243609537745,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6995,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.4067423394336151,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.6638,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.38393597416443764,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6797,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.47557262884475343,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.7142,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.41047225341638655,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.5714,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.36385385554164773,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.6242,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.430047639568558,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7213,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.4003759824175224,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.633,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.5276516864778331,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7372,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.37424234084134184,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.6498,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.39092128322326236,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6578,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.40114954765241456,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.6324,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.5654401298903662,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.7025,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.4565313925452614,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6726,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.45063897696530186,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.7099,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.5070284990026516,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.6629,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.4623797593188141,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.677,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.3542532180522851,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.6542,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.5165101099155814,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6961,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.41578784903230226,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.6144,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.4262885168527811,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6376,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.5125292108606694,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.6715,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.43702204246654686,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6939,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.37114897690493825,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.6775,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4435858684842088,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.683,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.36459331021956454,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.6328,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.39769689176348383,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6787,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.4392491742426648,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.6535,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4802610081280981,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.767,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.4972405040039743,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.6684,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4300061019366573,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.7004,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.37919684478557075,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.6371,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.39152226247770316,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6349,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.41967373082518133,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.7265,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4122962224075846,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6585,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.4436547161906095,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.7092,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.4964147457189863,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6714,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.4106307093293199,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.6533,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4716435870719622,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.7295,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.41576863897973987,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.7056,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.43674970561506565,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7308,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.3316605911176218,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.5981,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4410517556841685,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6245,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.46086759578868924,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.6425,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4266740889501166,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6741,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.46234931271388335,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.623,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.41902737827351866,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6583,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.3591130098650748,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.6731,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.41096488750723664,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.7105,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.45856275241674943,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.6413,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.5754725420679759,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6829,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.47553566714873646,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.7288,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.43786113087863426,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6404,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.46596646225893884,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.7697,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4172807297480227,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6499,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.4217294242714277,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.6422,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.4902760524078681,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6386,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.42289899866497593,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.6917,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.38391404447307315,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6692,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.4028401645223473,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.6718,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.3783484008933337,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.5929,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.4203126292842415,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.5779,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3733061190802989,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6366,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.44571185691890597,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.6681,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.39590497993421114,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6524,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.4595685602779222,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.6217,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.406658460916925,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6682,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.5966985871698207,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.7496,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.37185100028454815,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7142,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.5722795980301746,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.6986,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4423403708247473,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6844,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.3670252792367035,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.6368,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.3600132217622975,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.632,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.45259612096189367,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.6995,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.36480194691598306,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6072,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.43766481771460525,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.6251,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.49008337751938974,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7103,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.3860677060738148,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.6531,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.41635945899168325,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6685,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.4076160725177934,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.7144,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4075007837183193,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.654,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.5147838413019985,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.6378,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.41380566659647616,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.586,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.49445333550351556,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.6943,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.38169510328559775,
+      "learning_rate": 0.0,
+      "loss": 0.6749,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1075826133532672.0,
+      "train_loss": 0.7417626914024353,
+      "train_runtime": 19163.927,
+      "train_samples_per_second": 1.044,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1075826133532672.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e285932c220f7796bbe215fa382561159e889c97
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "q_proj",
+    "o_proj",
+    "up_proj",
+    "k_proj",
+    "down_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5ce1a0edb6da7c60c207707536af4faddd79ce7a
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e997f60d69e8d2b862c702246d6d2c85054d52b24dbf80c5fcfcbb271f47989
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dc9e219fa04676b7752adb85660fa1e8db64e8ac
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:857349115ca01448b46955f52ddcdcf4423c3f631d63ddc92e6e332bedfb0181
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8c290efbac5381465108628152698e1f7217eebd
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.9747163093624516,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.3776,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9971316137974952,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.3593,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 1.0294064859851075,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.4638,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8785473701775477,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.2433,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.9222189298141704,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.3731,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.9510475170175133,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.3591,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.7084226276467224,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.0935,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.906132971294305,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1725,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.7712213021772553,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.0772,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.3178145410160216,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.1589,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.850994370222525,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.0119,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7447680537725399,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.9941,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.8413526953194138,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 1.0714,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.6570030364761191,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.967,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 1.049204357937229,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 0.9898,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6319707351650328,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9134,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.6526982002329986,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.9306,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5434970784785387,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8736,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.5725189393741179,
+      "learning_rate": 0.0001,
+      "loss": 0.9179,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5302888010676386,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8855,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.6382052813053475,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.9381,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6044670064868235,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9268,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5627824401835214,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.894,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.4535652110015434,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8492,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.6327582382001423,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.8887,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5317494644379498,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8935,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.5122808745979125,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.9313,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5794191539976313,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9305,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.6283683198317399,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 1.0035,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5306201579346713,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8103,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.5486303280090739,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.8286,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.4823708388104478,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9333,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.594896847756731,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.8822,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.49095929767947233,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8244,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.5154915662155016,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.8675,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.521265502362853,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9447,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.4891594600069159,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8823,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.591233346119749,
+      "learning_rate": 0.0002,
+      "loss": 0.8902,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.49124794236045227,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.8059,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5758637086666665,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8825,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.4786735766305452,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.9047,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5528332925969597,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.894,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.5055203047790537,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.7982,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.6393093677384203,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8814,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.47679644352584477,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.8093,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.501127789032324,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.898,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.49935168203483027,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.8727,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5407161169427772,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.9324,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.5393844437050201,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.8809,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.4432871996924282,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8086,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.4596088706929705,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.8306,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5084204356374836,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8601,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.5353499362354399,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8785,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.6953583786517583,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8999,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.45453813559310047,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.8058,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5955649216955119,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.9337,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.467998115167168,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.7969,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5694988438877551,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.803,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.4993003636229634,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.8904,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5265132557079555,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.9117,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.5260799370031813,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.8068,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.534529456433131,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8493,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.5484845423317868,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.9084,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4717299830891815,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8762,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.5320054509994052,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.867,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4968147872972133,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8246,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.4556804959525972,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.756,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.6376392860317438,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.9274,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.503563685108971,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.818,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4332359635935246,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7394,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.5442703744579824,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.8818,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5029957297136984,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8752,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.471330634112903,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.8373,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.48541377554141146,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8262,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.7278423656673401,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.9023,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.46571363881804856,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7964,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.48225090517511315,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.7375,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.46463249793100286,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8404,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.5176022140990233,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.8608,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4640933574111419,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8464,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.5049737620655329,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.8597,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.4000446338602807,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.7712,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.5421156886912685,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.8473,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.43785601117177925,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7764,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.5777032565618996,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.8354,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5102041532130068,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8778,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.5812015512689956,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.8561,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.585857224143132,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8318,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.448955209649677,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.8565,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.45471013020840184,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7553,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.4753483930706963,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.7902,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.5429707171943804,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7649,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.4966628028743198,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.8799,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.5006757379214986,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8537,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.48092851201041953,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.8285,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.7589312779979084,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9664,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.4798855694434043,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.7525,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.5167176020578753,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8533,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.45898646958816514,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.8272,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5130366805408321,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8246,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.5382568732867721,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.8248,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4654109966764274,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8667,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.5335849522006713,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.8978,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5234000189758574,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8356,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.43548891193506006,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.719,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5086125572177761,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8092,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.4834049587544824,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.785,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5223605888622092,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8144,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.5897705067900536,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.9088,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.46431911679447213,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8309,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.504224351663702,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.8071,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.547454671929799,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7324,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.514111241073896,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.8113,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.47314412685832746,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.823,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.4863065522426431,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.78,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4875828457610195,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7818,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.485080279739068,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.8017,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4550894931203216,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.847,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.4707139910339425,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.7968,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.977513875288363,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.9965,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.45037237678267117,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.7461,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4609004148948183,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8684,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.5405117938056486,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.9122,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4445883762341974,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7881,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5076132174866915,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.8639,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.6126248956606299,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8841,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.46261186141768107,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.8062,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.43630485280685666,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7864,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.43348950728673896,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.7752,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5265221514244094,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8304,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.6271969859011378,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.7354,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5166949010390673,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7814,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.4822743087473739,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.7615,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5450969403265944,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.9126,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.4516497659644016,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.7809,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.6237163004882903,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.9205,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.47876327682526937,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.8275,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5395791344208695,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.795,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.5067725433393416,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.868,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.48354919242852473,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8144,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.5045229343088378,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.7937,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4501365842401607,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7669,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.4654460898472039,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.8003,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.49265876580137014,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7711,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.4196755153909947,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.6961,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.4730379510588242,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8488,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.4179085152791245,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.7634,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4504927184964666,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.769,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.47912198033551545,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.833,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.46676570068667694,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7524,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.45374835180690026,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.7972,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.49780694456258046,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.782,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.46717860780736153,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.8309,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4727836597310324,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7552,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.48113301659668845,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.7565,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4320875610697008,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7598,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.45990659802037936,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.7104,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.46416518781984156,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.708,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.45461241381325634,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.7886,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4395339059749342,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7879,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.45283536508024796,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.7552,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.48514938779893596,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8396,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.45290479179907883,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.8046,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.48291651301776906,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8381,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.43975408072203875,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.7761,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.5494387560382519,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8345,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.44048794546963965,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.8619,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.42475661368960177,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7788,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.47211544022836005,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.8035,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.41005411244101797,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7713,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.4798370923401185,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.7771,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5489860373506683,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7399,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.5949651572236011,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.7944,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5103101440970763,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.804,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4416216088387608,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.769,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.42314990470772496,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7831,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.4364464883815751,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.7723,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.41870456712768306,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7646,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.4325493067333497,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.7895,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5937916976258861,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8872,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.46076168375250054,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.7762,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4719415378217879,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8442,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.41742393603444783,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.7724,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5606094174530225,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8143,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.5123057991525729,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.8788,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.5478849614118967,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8339,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.5053262511022109,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.7908,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4843201758268366,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8058,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.4329137931409849,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.7424,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.48018937475308404,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8026,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.5833152367128422,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.8433,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4464496239722511,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7725,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.4501347749823692,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.7558,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.4389798585604824,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.752,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.47661997805132117,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.7626,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5413153605782053,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7964,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.4566333020208942,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.7959,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4621358452276674,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8675,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.48862296044982606,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.7648,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4753951660463967,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8307,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.5141194737862564,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.7077,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.45234333639641944,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8031,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.4515668456684562,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.731,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5737095843420752,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8357,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.535951024744029,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.8049,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.4511706176519065,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7383,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.46078484358501814,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.7057,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4615355759209768,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7867,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.4251775169661483,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.7808,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5973341731981406,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.822,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.5855646612286085,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.8053,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.41727662450704456,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7663,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.47845229997592925,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.838,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.5489140143070973,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8267,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.467885417875514,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.7373,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.479561570863446,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7829,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.45746076239178723,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.8088,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.507631601408242,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8016,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.430451314261295,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.7789,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4624490047965098,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.6833,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.5668639057309102,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.8195,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.5179540680889814,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7803,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.466776035445987,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.787,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4677485384390769,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8059,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5200907173542695,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.8346,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5263360649310521,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8515,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.5092342050626046,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.78,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.45337247035771755,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7766,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.48261592566978495,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.7214,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.4936136954755711,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7524,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.4352438974202268,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.8186,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5337821387174219,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8484,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.5624469243195404,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.846,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.4029135127012289,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7663,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.5380877662892091,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.8101,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5722178777673009,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7683,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.5379554537100044,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.8834,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.5568478872582503,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8667,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.5577808444837524,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.7033,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4962622589622996,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7538,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.46147950362919277,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.7837,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.5015852768900653,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.824,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.46416418218549393,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.7756,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.497949749763509,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8204,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.6079261447552436,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.8981,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.526743695859469,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8355,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.46954162920695786,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.7522,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4901504125403584,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8489,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.4698272073878121,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.824,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.43217871238602223,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8172,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.47857698799982135,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.8286,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.46748873118066614,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8197,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.5100842368597939,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.8053,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5672187705898414,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7568,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.4526590684351888,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.7985,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.46330274343290273,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.8475,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.4725498131891526,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.7905,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4428375291128829,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7589,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.5337519447527753,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.8534,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.47327745791785747,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7134,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.44948160295387496,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.7707,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5570473802648406,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8728,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.5308676094602198,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.8574,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5321765551025057,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8354,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.46111601239828837,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.7851,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.4354857475616162,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7556,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.47259238349750865,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.7517,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4477930536139435,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7668,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.5230405087481416,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.822,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.4631954532874808,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7812,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.41168641496446173,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.8112,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4832045243793889,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7872,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.39738429395928343,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.7565,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.6618431513711611,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8073,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.6426765850504291,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.8753,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4880857522804306,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.764,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.45007102188173415,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.7981,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4220971270876302,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7344,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.493693810778044,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.8367,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.39592572323090214,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7349,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.592672121309063,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.9222,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.48853563828611635,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7647,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.5099885648250655,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.8057,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.47344188754976313,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7374,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.5450900217778545,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.8305,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.500235444800896,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7839,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.4254375929183266,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.7141,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4931940454951766,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7789,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.4705259096844537,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.8136,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5208499422721253,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7836,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.5676233223845931,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.7864,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.5591655871222605,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.8028,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.4754102096910578,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.7925,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5643583330027085,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8378,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.47043092011499726,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.8159,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5322038857940076,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.826,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.4484512256373065,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.7877,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.45341619903015334,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7785,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.48637062714931634,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.7895,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.47476787977687446,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7731,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.490442660613966,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.7661,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.48217832291846996,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7772,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.5718547640020041,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.7886,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4790260871316992,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7743,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.538171029229575,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.7518,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.47444774727621175,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8274,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.4641069586982883,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.7492,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.39460271353192916,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7454,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.49461182872373133,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.8201,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.47081085668362577,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7969,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.4681455790198126,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.768,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.42370965593642185,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8096,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.5735821745386214,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.7375,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.42007123640118516,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8235,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.4465925599641599,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.766,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.531384270972724,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.8441,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.424343521475695,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.8606,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.3595778823729719,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7174,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.48050316531217974,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.8123,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5248556492574765,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.8686,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.45602577632397057,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.783,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.5427847444476832,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.851,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.5117593054863468,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.7989,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.45803642501159125,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7122,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.45691256458822394,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.7875,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4623278173666505,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7427,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.4557973067784474,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.7717,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4684151122111679,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7318,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.5180386857317837,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.8477,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.5704168958071772,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.8237,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.48729965053396485,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.7924,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.46909698585705867,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.8054,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.40570176445398337,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.7301,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.43930179100756683,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7743,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.45451598783625535,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.828,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4827073570209465,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7993,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.39628655140142455,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.6848,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.43425422771628786,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8164,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.39861016980449127,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.7698,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4581271153457384,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.8106,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.4195059927827672,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.779,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4117022682648752,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.6964,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.4440470842584414,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.7685,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.49439139053437337,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8156,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.504797045407952,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.8041,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.461560523441606,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7754,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.4871876444141448,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.778,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5133143350269246,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8086,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.3987091569511243,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.6926,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4196731036587334,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7249,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.6019163224867493,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.8312,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4795991675982521,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7772,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.4353338870569304,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.7211,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.46691733568868105,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7864,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.3857036487152233,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.7111,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4495366286474417,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.759,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.4744694748595945,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.7717,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4704097676256151,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7382,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.36811260781930977,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.6893,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4888253364404233,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8192,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.4451286475399139,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.7518,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.5475940460741608,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7607,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.40993584937801125,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.763,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.48824216934054654,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7755,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.515099943428024,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.7924,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.43134057267310605,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7754,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.5504098337356608,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.8429,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4940928471631194,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7701,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.5078007893178331,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.8127,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.40809202305330516,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7131,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.5267721859621575,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.7879,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.555623416935034,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7464,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.47242108718334574,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.7482,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.434436157164965,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.797,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5143690476812447,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.7908,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4712395271201534,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7755,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.46279760331106856,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.7637,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4952490838053615,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7626,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.4603696659954351,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.7616,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.45145665775466476,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7853,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.4922555531672983,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.7621,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4220625389408268,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7621,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.44844932466206555,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7799,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4679186618556549,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.8217,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.419269058125717,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7608,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.41740087340015597,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7297,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.45791369382882224,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.765,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5255076947887414,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.8014,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.459584359122999,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.8193,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.47469393989155145,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7453,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.4131509273470646,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.762,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.42581743577782966,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7363,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.38516684481175223,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.7228,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.3836176742214281,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6523,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.4915656142232788,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.7871,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.48349109519241057,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7877,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.44163695697766775,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.7428,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.4328561392531088,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7502,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.4938027784803824,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.7887,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4297049972537339,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.8057,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.44007381681857877,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.7985,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.3917744067596416,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7456,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.41200502929810034,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.7607,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5236098268934606,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.8249,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.4655752655037792,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.6992,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.4956055625853647,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7292,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.45043226529870933,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.7357,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4192682347287542,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8331,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.38132386168633237,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.6636,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.46353702228952487,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7522,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.4144194902717004,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.7369,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4469177538620975,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7435,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.3964842354715512,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.6633,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4363718416537173,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.8078,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.42421510982110067,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.6786,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3736016797441009,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.689,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.45707318258464946,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.8534,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5421905187769916,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.8016,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.5294648573601008,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.8031,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5202468416897913,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.8505,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.43684561909750713,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.7959,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.41976006424603907,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7377,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.3750568280997164,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.6778,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4844704168166154,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7729,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.4573814253014277,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.7295,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4229278166795163,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.779,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.39134102147840255,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.7282,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4018318818965121,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7387,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.4525247116066564,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.7727,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.48012378662271826,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.6921,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.4324585146029964,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.766,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.40183349656608747,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7135,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.4214155735954674,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.7117,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4663710431593555,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7223,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.3905830218453315,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.6905,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.5105981653821051,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7125,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.4410317560984314,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.7408,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.449784109636782,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.8165,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.41779437775287087,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.71,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4475202549139668,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7816,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.6204051569206457,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.8919,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.4800658555085623,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8167,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.4226223390547385,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.6846,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4283393298389574,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7417,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.43211339102560453,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.7491,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.49152903026855915,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.838,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.452265605480497,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.7209,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.545278818491295,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.845,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.5764244596882926,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.8211,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.6010972612369179,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8423,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.44105865182591925,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.6825,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3877950674669648,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.6835,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.43012622299713454,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.7166,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4845720628208265,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7753,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.4227159679849885,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.6918,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.42693754032896614,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.786,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.43350068018540505,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.6695,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.4539085530411711,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7464,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.4076576810042241,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.7067,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4377396029011281,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7387,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.45352747003340027,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.8014,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.47126093132665964,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7444,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.41395625679635656,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7425,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4231225249203355,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.6971,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.4138814527994516,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.7182,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.4650691019424587,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7749,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.4760664405868015,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.7578,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.45232863318343103,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.6736,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.5230469103110574,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.8259,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4338464917355991,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7681,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.49859848326183653,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.776,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.44729854713715494,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7363,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.4322585615668971,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.7653,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.41427933628653707,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7755,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.46708127193055726,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.7904,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.41499455660098955,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7066,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.5020715750722332,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.7784,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.5156058012318389,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.8441,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.4522808025098913,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.8643,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3995307603496349,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7943,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.4285524294891582,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.7826,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.523136423212411,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7701,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.4391366370665467,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.7624,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.46959643574814464,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.781,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.40194940332797746,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.7015,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.4684327943617427,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7251,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.39832501600935233,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.7508,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.44967938370538474,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.823,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.38964436200631264,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.6597,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.5875952758046031,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.8907,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.41508015467363896,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.7338,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.36590855761782176,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7389,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.490121745593381,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.8102,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.44088560611113053,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7914,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.5115681409107602,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.7538,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.43192003334052165,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7648,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.41543866901551685,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.6623,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.4059960163507851,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.6941,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.41196465157911266,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.6495,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4134185282392816,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.699,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.38557605015027085,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.6611,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.4095403096672025,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.721,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.4986737593577314,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.7324,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.42721250489256873,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7572,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.5279985164816671,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.7984,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.49640263739102203,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.8373,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.7823408697685369,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.6345,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4464110012609075,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7183,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.4941674866246718,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.765,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4212429963438416,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.8026,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.5015401184238121,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.7119,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4712569345916714,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7601,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.48298983722346495,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7468,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.42128236331141106,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6988,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.43660086130573456,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.7,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.403372334702858,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.6843,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.35725815795608484,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.687,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.46502382720079394,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7597,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.47555465949251546,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.7433,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.43203790553360294,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7289,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.49746515651547685,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.7373,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.5435992460513512,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7064,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.4479501305581582,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.7408,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.45825882637041954,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7366,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.4207867694918564,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.7701,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4061367178446236,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7346,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.5198874812139052,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.81,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.37697730018063164,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7049,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.44775689222934073,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.7461,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5033512458411837,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7314,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.41579339723003056,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.6281,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.36809122622614143,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.6204,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.45194043935373646,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.7546,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.4288747716764305,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6822,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.5903941475612065,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.8609,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4672594849005474,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7698,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.4942648098027479,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.7369,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.42986225428556174,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7064,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.42920038852281717,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.7679,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.5203513666933222,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7905,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.4944381595276974,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.7235,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.46170223055809334,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.74,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.5172231671804536,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.7417,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.38242212830027367,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6738,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.6621820445926183,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.8639,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.5380969136869799,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.8612,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.4413616622505353,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.7477,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3926736956223621,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6948,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.5991568297488339,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.9187,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.39312025261804895,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7766,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.3487157010010168,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.703,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.43079790932596457,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7772,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.4121308140189897,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.823,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.38090987839878365,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7192,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.7962407192316323,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.8027,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4707226388623328,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7552,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.42748653530505404,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.7287,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4088509360914169,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7127,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.47058586234288113,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.7746,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5021539526911709,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.766,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.42619414718966814,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.7145,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3736131505311299,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7253,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.4199445019074886,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.7234,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4080505134188845,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7694,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.38891966675084405,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.7481,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.5546274481813345,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7072,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.3842808967932505,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.6833,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.5323218413319508,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7154,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.5229109514986995,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.7449,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4643395245494808,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6746,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.4429720701897638,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.7769,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4326668366153276,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6756,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.4849046251181642,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.7104,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.41564972787398957,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7073,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.4493781091924018,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.7065,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.42388302507437037,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6564,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.4923716589001938,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.7741,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4178327260273074,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7493,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.3762637913831736,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.7049,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3969977833164792,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7313,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.4335735319780596,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.7144,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4138327453913176,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.77,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.5047893118657163,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.7011,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3950191841703804,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7883,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.44424528822131326,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.8441,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.6385143117966893,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7202,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.46516466356510056,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.6778,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4395435007469664,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.7256,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.43429061744253084,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.6587,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4498098946248189,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7129,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.41908385366486545,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.7348,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.40052577581532633,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7171,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.4008782072503879,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.6499,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.39949035280289624,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7325,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.4338227097132858,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.68,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4792328739644269,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6854,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.5433559597110078,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.7641,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.6008664840716792,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.848,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.5521876730877993,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.7713,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.46207986616914726,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7989,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.43605957294049125,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.7604,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.4622362897937555,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7442,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.4718725513449837,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.747,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3847387882100169,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6951,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.558777399168513,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.7929,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.46752261709614024,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.8129,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.4249726636385701,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.7479,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5005423717044616,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7445,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.4861351628893515,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.7791,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.5639604973250558,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.8149,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.45354479874298465,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.7934,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4628445069619903,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7374,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.5195955341225196,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.8352,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.43595476728041826,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7256,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.3606912458601585,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.6895,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4643672351016709,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7274,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.5095297400473262,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.779,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3910073850309763,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7128,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.4219109386141692,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.7065,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4997152339765753,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7847,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.5290062282289308,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.8088,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.4437823849088452,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7569,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.48287565653761594,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.7397,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.47399955722247405,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7304,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.48212727147786694,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.7857,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4081587531539875,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7182,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.38747498955693305,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.6738,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4781253938986905,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.8037,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.44129600676019193,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.7042,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.42560708559719823,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7282,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.4477193550610481,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.7277,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.5024188974022757,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7614,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.4616493200347517,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.7196,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.44716244620334705,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7382,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.5509178451941221,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.8428,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4648368930268642,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7804,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.41769829538371145,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.6763,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.49249667084061416,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6776,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.4368224988064386,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.7158,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.6091638158209373,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7423,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.525020322096225,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.7972,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.5862462392458725,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.8035,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.5150342090196197,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.7262,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4827917839076806,
+      "learning_rate": 0.0001,
+      "loss": 0.687,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.49756591436412745,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.7972,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.6491106991630037,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7236,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.45070946662545036,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.733,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4404908086167703,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.676,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.42822642977967657,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.6744,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.42878976231733074,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7756,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.41192836438899527,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.7018,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3886159645768922,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6888,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.42732623258735747,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.7341,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.5206134048709076,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7495,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.4736546649795084,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.761,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4325929971249264,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6619,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.5365900497916197,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.7366,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4769972496588147,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7723,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.43283977877619345,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.7172,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4838519468109514,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.8092,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.5482584578563439,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.7234,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3719346143111832,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6572,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.4558970091769671,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.7157,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.48361966098232073,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6677,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.5467192328851579,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.7161,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.47218982542645954,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7334,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.44148771569782885,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.7564,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.39846129196993135,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6859,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.4944897786007179,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.699,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5018041502566262,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.9113,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.6628582201931914,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.8114,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4295285155624496,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7586,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.4294367293079005,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.6762,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.49039532092187726,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.674,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.36857242851828004,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.628,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4879511571759796,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7196,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.4271865794426687,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.6725,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.46956215263281625,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7399,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.43799030257027066,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.7,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5321456602261782,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7986,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.4675180839233065,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.7248,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.5032933922334141,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7039,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.45260383572623736,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.6801,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.42635949104610027,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7046,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.49559611472926934,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.7455,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.4777232582339619,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.8166,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.5557898093105165,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.741,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.48456383759255994,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.8253,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.4095521800669326,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.6651,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.42270261837701667,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7433,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.4638294362795246,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.8163,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4935870098713025,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7585,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.45068959511196954,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.7753,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.4031024404777808,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6753,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.4236257907655874,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.764,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4423818785729849,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7659,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.36826114157455997,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.7201,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.4490013780803883,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7866,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.42498937946884835,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.7276,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4558168107297979,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6715,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.44808068761867476,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.7468,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.5426271223678154,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6919,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.41706166479314355,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.6811,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.449909836051013,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7287,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.40904128257034156,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.6821,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.5514079567161478,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.7014,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.5621941021255352,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.714,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.48922281319432914,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6964,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.40495064573395556,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.6596,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.37669628443541675,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.679,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.38212624787048943,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.7044,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4164961167082215,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7001,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.5806986076006299,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.7471,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.4128919453980456,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6802,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.4699849720330913,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.7399,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5687699333630892,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7319,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.41632514043834745,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.6833,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.48479864488960833,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7826,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.48285527273343337,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.7422,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.43286802937523056,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7611,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.4075535963903668,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.6694,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.3753612060340906,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6854,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.3843107108459075,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.6427,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4233515369127332,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7574,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.6060253045716254,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.7259,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.43464604597808115,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7254,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.46273691599511046,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.6386,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.4274034364439089,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6916,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.4377273291277029,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.7152,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.49562762295599727,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.7728,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.4119996776096918,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.6834,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4360591441700369,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6787,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.5539604553956203,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.7181,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.4987585680019293,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7772,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.4818796800652592,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.7785,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.38384019808049874,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.648,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.5149515623933224,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.7902,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.46017946680628197,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6885,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.3825756824730468,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.6916,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.44635582841279664,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7616,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.43836625715361827,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.7535,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.39445380579688616,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7074,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.47492623940823814,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.6366,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4140802668973822,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6884,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.43969996365262,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.7218,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.4315000246809719,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6829,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.38695555281571414,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.6766,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.45378683454472246,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7154,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.5025158375202976,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.8012,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5368880369743676,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7461,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.4469626531765936,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.6611,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3904067865100921,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6571,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.5395588619367233,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.7605,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4063505434642953,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7216,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.439733970220297,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.7483,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3903217297594714,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6226,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.35791207602972686,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.675,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.46305859768985486,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7425,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.3819436794869962,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.6778,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.46174364856153316,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7653,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.4073259287429649,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.6721,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.5036835056625664,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6868,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.4921489753535104,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.6549,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.42186934467470866,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6843,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.5145851167911595,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.7688,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.44355951530594984,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6989,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.46686407305336036,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.6879,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.46822811242944595,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7705,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.5005001982499331,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.7039,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.6557598549144436,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7628,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.5907766808878118,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.7857,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3754334115771533,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.6747,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.46714188110580834,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.6875,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.46643998644250145,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6485,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.4962242868389372,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.7604,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.4856780772069199,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7647,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.40212979381388003,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.6808,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.4092513557560957,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7228,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.468504700291808,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.7467,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.48832366184278075,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7542,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.45914601401061483,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.7258,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4310698149491041,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6253,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.4269385113862225,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.7287,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5374205730161659,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7529,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.4238875672676686,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.7317,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4080467891819177,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7255,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.4796802337184242,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.6419,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4398667233380474,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6051,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.4571763101900469,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.6306,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.44610926768714976,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6575,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.39803843376689096,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.6636,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4044923123399956,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6976,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.41279920205145776,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.7651,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.49797839499652,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7738,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.4941980334443274,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.6992,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4013061255393633,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.684,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.5030148764960346,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.7332,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4256492278337513,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.594,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.5368193272908817,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.7029,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4462677045957923,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6916,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.425300862246148,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.7489,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.5005085207709417,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7802,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.3752762320754259,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.7251,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4425702481945523,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6513,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.4299899094756773,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.6924,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.4226631171202636,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7055,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.4246977710889074,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.6853,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.480018168315808,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.7548,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.41398085518741246,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.6821,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.3877776812865788,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6502,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.39052380092941585,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.7182,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.44351355052746444,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6959,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.42646815820261946,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.6902,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.4228695561254,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6941,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.4100351559141488,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.6686,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.42136708533291056,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6992,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.42797444786900646,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.6868,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.4908314945911969,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7185,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.38129021246302564,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.6333,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4654414447152424,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7339,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.5224291883635922,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.6882,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.42876193303301113,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7271,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.416216982433621,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.6564,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4282729312446709,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6968,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.44441536086740685,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.6945,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4564075558753912,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7581,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.4637977268593527,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.7087,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4235543963213367,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6951,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.42142190752970404,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.7701,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4605267541051739,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6899,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.44996033056969564,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.6572,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4648115272389889,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7327,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.42083961567449996,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.6481,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3678257321600699,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6567,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.41713246313477464,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.6805,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.37052743732415666,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6812,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.4344230327391137,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.6924,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.44582569093875274,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7686,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.4199989527857804,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.6835,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.40433614049645716,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6678,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.49111083242813114,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.7783,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.40452113487742064,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.699,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.4185998946571293,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.6536,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.42129377379980115,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6622,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.4778900709802409,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.723,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.47538770537483455,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7077,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.46213185401583895,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.7447,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.46342637710809953,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6423,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.4603269264208612,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.7205,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3958040158831917,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.7042,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.4394189681367613,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.7338,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.38026300704001265,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6468,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.4585870632104486,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.7556,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.5708905518795303,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6803,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.44859739726686104,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.6862,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.45353344720099065,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6213,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.4514540013654558,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.7197,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.43520247996323425,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6859,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.44573263487448433,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.7486,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5369980225986257,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7221,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.4684742349759021,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.704,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.4247240492781613,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6429,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.4352750428153809,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.6176,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3745093117472935,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6385,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.37654844380452335,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6749,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3795730674439509,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6537,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.5229481781582966,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.694,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3632109076521302,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6259,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.41367182887741916,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.6311,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.4537464387618468,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7032,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.4741123520579128,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.7449,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4514703347255224,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6484,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.39202622885904026,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.6691,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.47745896342818134,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7081,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.434060322789685,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.6934,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.527185929501206,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6384,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.38250138247721094,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.6515,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.40168446920004564,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6932,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.41208113755503484,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.682,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.7080535876026216,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6448,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.4556640559810192,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.697,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.44159196719908,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.68,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.3754475837443555,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.6864,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.40973853079506767,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6169,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.40982493587788504,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.6973,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.39166629221274496,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6458,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.4396057241312862,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.6047,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.45675566700922193,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.676,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.3919823955872468,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.6901,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4447866341329715,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7109,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.38144466911816505,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.6441,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.5572834420081134,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6963,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.3841106808846123,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.6573,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.42414157313789447,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6725,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.42756170338582344,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.6926,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4355363485737852,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7292,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.3934071933253508,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.6016,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.5496966401262666,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7274,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.3790492810481393,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.6842,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4401342359507008,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6831,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.3971144677232109,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.6354,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.39471198945480024,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6892,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.3987529330890745,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.6804,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5701865245980139,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7172,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.5008429174971117,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.6922,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4902919216380368,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6681,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.3357650120743114,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.6205,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4600035146676257,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7735,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.4402983412029306,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.7874,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4187051978636392,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6772,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.41472219827114154,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.6537,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.46571200032117377,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7351,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.4768836196404258,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.6293,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.3916949256445519,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6879,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.4375340971358518,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.7221,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.40701462045279024,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6553,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.7622096756879763,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.7598,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4453193853925462,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6814,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.3594708239161031,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.6828,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.49749631565866714,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7251,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.45747003096606015,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.7191,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4000803837289144,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7036,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.4446653500117122,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.6766,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4210251334594636,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6718,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.4470565411188531,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.734,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.45021310458295094,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7136,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.5674513660698329,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.6502,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.44632307726456727,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6521,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.4798937095611223,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.687,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.45454249364117827,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6255,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.461447759914991,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.7076,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4068559853634475,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6594,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.3704032851078179,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.6398,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.491308392611432,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7316,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.5178312919433019,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.7616,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.41611180745604653,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6684,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.49009980647577833,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.7914,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.49550826102102696,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7058,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.5778361024572423,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.78,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4231097330482353,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6936,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.390916302418808,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.6755,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.3766208095255213,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.64,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.44799919275752687,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.7635,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.38477675195872846,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.681,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.42836343972350627,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.6173,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4387436284079448,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6526,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.44143701497068183,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.7512,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.402519873568703,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6481,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.386754466514997,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.6381,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4186412142799634,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7032,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.45146307527320995,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.7515,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.4417643283832573,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.698,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.3444521337309674,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.6701,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.42824992941627515,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6756,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.450921051363528,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.7463,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3632777176469063,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6536,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.3222506999183127,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.5806,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.40924533769273447,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6717,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.37597076668253954,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6468,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4058231666861766,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6655,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.4341008530986436,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.6763,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.4859754406120522,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6578,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.37689948007678203,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.6997,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.40394761311539573,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6621,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.40040126144967014,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.6598,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.48835841579438144,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6989,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.4305382837026107,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.6461,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4278732219711235,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6726,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.3689128305355096,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.6815,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3759714721714428,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6556,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.4067960037837653,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.6577,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4528349081101457,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6634,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.46024702381528826,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.6808,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3950513918741196,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6331,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.37157574354329653,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.6373,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.46104339234263786,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6874,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.34276429191209973,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.6057,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4371487382126511,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6717,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.4265062723224766,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.6898,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5339472722709265,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7031,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.42599256174757205,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.6726,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.5090136399906379,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.766,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.35801248358183085,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.7139,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4168201896132661,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6341,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.4369302146936795,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.6986,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.3672019449783104,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6283,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.3900868830211544,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.6306,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.40015522687040905,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7209,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.4474832481268081,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.715,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.42044492288355934,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6581,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.45455284502168075,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.7389,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3559518422054641,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.5842,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.4756430447722567,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.7225,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.41960456820551933,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6515,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.4669997831410201,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.6711,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4570025939597547,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7142,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.4053240160707213,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.6649,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.38111176288776105,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6833,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.4234102077802766,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.6516,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3765791652978205,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6913,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.551148258949945,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.7425,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.42329375710806877,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6798,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.3654041576107259,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.5991,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5889008070953602,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7691,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.5321320962028306,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.656,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 1.3128728693670322,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7597,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.551359241348889,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.7612,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.418425271349496,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.669,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.48724011845090404,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.7145,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4338801306917334,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7357,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.4094866422544633,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.6743,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.38638552613338145,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6742,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.45972260958685873,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.6301,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.41606548603580784,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.5795,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.3507900711228258,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.6413,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.44622918397105277,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6685,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.5112647107030255,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.7931,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.4117469967232782,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6661,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.4413963948413441,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.615,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4945503410986367,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7369,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.47458541759364137,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.6466,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.42592905361928207,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6743,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.4214118291708083,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.6624,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3744677763502848,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6638,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.4918867346221103,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.7477,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4085862430540114,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6843,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.44248196572285126,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.6931,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.396506854793697,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6379,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.45465112135282115,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.6958,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.38704113003396995,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6362,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.41941980015963803,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.6779,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3819239255711896,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6414,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.35946769399800094,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.6134,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.412022695130824,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6416,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.43726913529127953,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.6597,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.41608067811211547,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6861,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.3956515683990938,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.6173,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.43835608143316607,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6351,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.36496866124657096,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.6065,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.42353022496668236,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.7297,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.3976815646227283,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.6252,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.46867159505574235,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7288,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.4115098679614629,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.6694,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.47141463490280416,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.794,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.4073181131247274,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.6754,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4435975572514238,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6207,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.4405035884942258,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.6805,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.43165585906812914,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6287,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.48538558114212177,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.7143,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.42789275010803174,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6834,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.4072666150542458,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.6485,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3962410302880435,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6406,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.3436008649179981,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.6453,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.39717262188667135,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6477,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.37569259361888674,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.6439,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.40099423473174634,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6668,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.3804594028424689,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.6666,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.58156202113261,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.7181,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.37780741332473505,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.6722,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4567096825147793,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.7099,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.4050563966140698,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.6702,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.4467308030979363,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6345,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.6440589873431481,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.6698,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4877697380021303,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.72,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.3703836395380511,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.5958,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.5325718261370398,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6877,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.38565613271694577,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.6124,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.45076272537905554,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6771,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.4467801054468768,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.7325,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.36868814232178826,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6423,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.4487373063261531,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.6863,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.4769150584728038,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6635,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.4736670630391326,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.7362,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.40655879309727255,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6957,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.4812210423869589,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.7792,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4868531913578826,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7368,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.4336837670380572,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.6543,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4392646108534538,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.7096,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.44141586332695404,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.6493,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.46424690641874294,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6527,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.4422767467200843,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.6429,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.46743448263495774,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7272,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.48685890683770405,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.7151,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.4377172217417621,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6824,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.4830596747463407,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.6788,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.38925973276029074,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6471,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.3961447377270664,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.6551,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.48245056779683043,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.73,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.444345919658435,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.6838,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4079019102628482,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.711,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.47973700172631495,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.7308,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.36030570169209764,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6871,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.45583825575182646,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.6205,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.4569274995213183,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.702,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.41678505062448895,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.7135,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.47719600813805696,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.7628,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.35211680639410153,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.6208,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.41121821181807183,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6527,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.3854040359397552,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.6421,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4685780953190182,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.7006,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.42627774566843196,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.6251,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.4376010907122848,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.7067,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.36396613930614274,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.5501,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.39854325327008483,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6593,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.4006920610890835,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.5991,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4804281957029272,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.7043,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.4157677545297881,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.6422,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3909471204538716,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.7383,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.5182573773191494,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.6585,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4396361114163665,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6524,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.43671327334515353,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.6736,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4470597713138655,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6371,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.6447410630483885,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.6985,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3784539577065875,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6113,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.4741807533832942,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.7192,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.40696168117855497,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6546,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.46233229752226745,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.7013,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.4199909816108278,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6514,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.3903462015411026,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.6777,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.43867300626612943,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6588,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.4451731647413953,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.6186,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4177041897523593,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6913,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.42231456720258476,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.6489,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.44824163944843665,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6784,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.43471567283177637,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.7208,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.498934955818159,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6437,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.48580807688107563,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.668,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.43952589415837257,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6895,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.37149033642589935,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.67,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4852786012636073,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6688,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.36469666185134314,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.651,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.44325768299991547,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6559,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.4827536457562476,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.6705,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3893119525845464,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6809,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.39947775593612334,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.5895,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4268305287341812,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6476,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.3904970988564565,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.6318,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.5030706139039645,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.711,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.448425693298618,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.6872,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3676551531074703,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6875,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.48434382375615254,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.7207,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.36427386682071533,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6319,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.822845317066879,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.6233,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.37114922831083996,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6094,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.3754095183151161,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.6578,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.45078676903592013,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6827,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.4168990815173287,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.7397,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4507539159693322,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6386,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.39250543750014394,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.6932,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.40505225352855967,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6981,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.4182252224511026,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.7056,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4438636305914537,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7024,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.44466805445876056,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.6573,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.48050301828153263,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.682,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.5135694817857425,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.6818,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5023625394808598,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7109,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.40559397685771054,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.6128,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.48228319917260726,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.652,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.4028627412024973,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.6748,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4700410335749985,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7462,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.4492313961386215,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.6375,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.563654315078854,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6359,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.5057687381686432,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.6714,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.4652536601650651,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.7764,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.3662343111276865,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.6893,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.4416713848647298,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7013,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.381951299984924,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.6632,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.37155212677639327,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6821,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.46471185332141823,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.7189,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4163494889726177,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.5728,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.38152676389057105,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.6235,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.408199847432243,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7241,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.3855748055325008,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.6325,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.5347027654755969,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7415,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.3850161743575303,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.6479,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.39447167520068405,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6581,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.3954999351156733,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.627,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.47620026982022223,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.7013,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.47863278534676423,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6721,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.44263756236758506,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.705,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.48622880718242784,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.6606,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.4518729241142917,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6761,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.36692094947734094,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.6531,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.49770220770437756,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6987,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.386846496222818,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.6162,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.42295808925978085,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6378,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.515620590376939,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.6717,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.4236383792998744,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.694,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.3745933440933498,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.6782,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.44548985289368986,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6824,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.351935666658701,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.635,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3927876866660279,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6844,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.4543533700238643,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.6545,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4726408949660419,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.764,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.4547022906964549,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.6669,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4386988408797006,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.7046,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.364811545576542,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.6357,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.38316319547541294,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6351,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.409523938404599,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.7255,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3938547911550198,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6623,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.4130480095438861,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.7109,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.4607401947793766,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6699,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.4095248421137157,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.6493,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5025596720781565,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.7294,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.40212299450152705,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.706,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.4353245366674736,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7316,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.3289292410419503,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.5969,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4485381283671457,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6299,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.4350396855142743,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.6427,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.44820779403912475,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6739,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.45985438515252697,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.6196,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4155698862169645,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6562,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.35992935478436117,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.6739,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4210190191253645,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.7084,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.46506063315072393,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.641,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.45217811984125644,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6842,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.481584910999147,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.7273,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.44495631426659343,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6449,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.5867587286277899,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.7682,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.41093637951300743,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6484,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.3994694959002646,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.6421,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.8741986088770987,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6413,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.42549697370180717,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.6889,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.36065938333251124,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6689,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.40473093376739466,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.6713,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.38014814555165943,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.5928,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.4300530742116439,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.5796,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.36263394110562225,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6379,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.43216298500361494,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.6708,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4035772994578955,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6585,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.46471278001401517,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.6209,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.39742181119724646,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6714,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.4642886320610059,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.7515,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.3552316968541323,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7118,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.5832555040857262,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.702,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4445173718634954,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6818,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.36950466729677595,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.6382,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.3563943247336583,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6319,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.44786114440122127,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.6989,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.37817784586678505,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.606,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.4459885744722213,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.6281,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.485956429014473,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7132,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.39125958924755905,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.6556,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4158554391756689,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6668,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.3993930174404647,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.7188,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.42408035043868864,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6531,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.47997507721109506,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.6357,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.42129152338144066,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.5859,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.5059206793056723,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.6943,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.39015189642135556,
+      "learning_rate": 0.0,
+      "loss": 0.6774,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1075826133532672.0,
+      "train_loss": 0.7417825475215912,
+      "train_runtime": 19119.0811,
+      "train_samples_per_second": 1.046,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1075826133532672.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb9d3b3f3084162eed13db43858cadfc73f167bc
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "k_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ba12f28e8d2e6b1ef6d1ea09692cd8c5d81c8b5a
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e1e7a62972a4ff09f208264f5f966c407f1561455825f5d80f41c82ce076c7c
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4ed2b96d1aab89dc2713090062115cab8f9eeb4e
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:190eefee19dd4b71cfab88915cd3c3978b227d90c118cad6087b261cf0926aa1
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2147a9e04e5c7913b0682519cab83918500da535
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8884397703958815,
+      "learning_rate": 5e-05,
+      "loss": 1.4159,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.0606920004031573,
+      "learning_rate": 0.0001,
+      "loss": 1.573,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.7084696914558983,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1484,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7205956606575595,
+      "learning_rate": 0.0002,
+      "loss": 1.0082,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.3784355426416521,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.1161,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7951498968301047,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.0714,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.491689920077788,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.8671,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5745437616364344,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.9945,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5848934441972632,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.943,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.569529067821101,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.954,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5674545381514693,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.9394,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.6029677361735906,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.8518,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.6336646808165951,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.9159,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5679261447513578,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.8465,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5308201822016704,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.8743,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5085526050026933,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.9164,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.5142089161440812,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.8633,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6806076228504198,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.9558,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5181061101016946,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.8307,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5300147626591982,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.8933,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5303314473113463,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.7733,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5637808040036959,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.9473,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.4928223181651132,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8076,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.46232268164750967,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.8499,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5801112496983529,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.9608,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5567373224539722,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.8963,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5151562065420886,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.9003,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5278724877114119,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.9522,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.6208439508269434,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 1.0636,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3869070230027433,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.7688,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.5264876039649024,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.9261,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5593455537479747,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 1.0018,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4808510845543371,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.8686,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.49696919912770154,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.8674,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4862474594477952,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8905,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4180161696791824,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.8139,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4939094774176364,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.8379,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4253560025632094,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.8151,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.5572204490964708,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.892,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4723541523805916,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.8551,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.4568174408219518,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.7905,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.447517908676915,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.8477,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4766204933369398,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.8327,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4499203108768281,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.8197,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.528409694573524,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.9223,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.72502564770408,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.8012,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5533119892695849,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.8605,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4049030465990177,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.8528,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4808870640382263,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.87,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.52409820233413,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.8126,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.6056007216795501,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.9253,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4288099908418435,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.7949,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5189621718791575,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.8815,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.45129110175069165,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.786,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.6277596482686542,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.9032,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.44406976016915317,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.7808,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4918218002370572,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.8423,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.6049398582277967,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.9006,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.49953351896163867,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.8549,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.44150354205208875,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.8135,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.44711567607942587,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.7721,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.6576684715478033,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.9037,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.45292352334579544,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.8571,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4241065430917788,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.7847,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3914694358431383,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7819,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4322735032742982,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.8738,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4309376434320946,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.8367,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.46566584064135313,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.8552,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.429308500201854,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.8101,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.587185938724539,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.8888,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.5843203522868754,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.967,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4892032041186544,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.8529,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.6313546824278242,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.8197,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.46706294787211833,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.7694,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5194463291566068,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.7919,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.5438788300581358,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.8728,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.5701658320866863,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.7784,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.5172945223466745,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.9284,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.45218007811343347,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.7992,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.45924177524997184,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.8009,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.6083016332888105,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7786,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.44739935065083747,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.8756,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.42971294075618266,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.8298,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5815137727505052,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.8224,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.38147701842400894,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.7335,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4322102435702118,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.7831,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.5089686624428282,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.8416,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.5505056218355707,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.8012,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.5070049831936402,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.8553,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3899330884875655,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.7763,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4056999323126134,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.7543,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4412253695071463,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.8095,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4619018727704485,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.7423,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.49371015234697324,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.735,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4298667400521944,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.758,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.44698443987236036,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.7787,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 1.192577588135792,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.9124,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.544145417635771,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7845,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.42939269218996073,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.7219,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.44156059519958973,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.8113,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.44046523940845045,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.8134,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5488593124276808,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.8677,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5225130363831496,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.768,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4608618963436507,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.7307,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.4858313632617286,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.8246,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5168158601625553,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.8962,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.38964343140870383,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.7524,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.44153770705871775,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.8091,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4201989209720781,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.7202,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5585657269663398,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.8157,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.39368801093203876,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.6741,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5064472219894975,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.8613,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.47217273527245207,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.8323,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4526610600568625,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.836,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4287421695372561,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.7581,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5910599950264193,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.8301,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4205494298112779,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.721,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4114647219715256,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.7665,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.43372997488992915,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.8017,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4956446279256857,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.823,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.5412074518102485,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.8656,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4602496517708938,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7907,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4224483839145656,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7249,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.38290363636099745,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.715,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4116424800917143,
+      "learning_rate": 0.0,
+      "loss": 0.8042,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 108802645983232.0,
+      "train_loss": 0.8561052279472351,
+      "train_runtime": 1918.9923,
+      "train_samples_per_second": 1.042,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 108802645983232.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef5b433c4e8a8c22c5322be3073838a5d29dd92a
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "q_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..af0834266e95e92cb8662ec320fe4104a49f6419
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098e9c0d4365d730afb30afd26c20651b3b6d367b43b15c52091b7c2da9f2fe2
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fb63f3087bab7382842ba324cedeb7a7aeafb292
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd552e8627b7e5460c4b47166f452609fa9e49424e6b38e9ab8b059a7f44c099
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b4e9cd70429134f1eff34f2c9a87dd3192592245
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,476 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8881067032933168,
+      "learning_rate": 0.0001,
+      "loss": 1.4945,
+      "step": 1
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7770342479329756,
+      "learning_rate": 0.0002,
+      "loss": 1.2212,
+      "step": 2
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.9220773230190236,
+      "learning_rate": 0.0001998629534754574,
+      "loss": 1.2649,
+      "step": 3
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.8155613335304368,
+      "learning_rate": 0.00019945218953682734,
+      "loss": 1.0363,
+      "step": 4
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5322614055457643,
+      "learning_rate": 0.00019876883405951377,
+      "loss": 0.9997,
+      "step": 5
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.44105392561516205,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 0.9386,
+      "step": 6
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.7833219433873575,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.9249,
+      "step": 7
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.745363100150533,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.927,
+      "step": 8
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4819461525156197,
+      "learning_rate": 0.00019335804264972018,
+      "loss": 0.9433,
+      "step": 9
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7445630307920716,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.9034,
+      "step": 10
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4924623708308096,
+      "learning_rate": 0.0001891006524188368,
+      "loss": 0.8899,
+      "step": 11
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.403071806760822,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8527,
+      "step": 12
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4430564803213766,
+      "learning_rate": 0.00018386705679454242,
+      "loss": 0.9423,
+      "step": 13
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4014569573385412,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.9413,
+      "step": 14
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.44879724353918893,
+      "learning_rate": 0.0001777145961456971,
+      "loss": 0.9282,
+      "step": 15
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4431764500501959,
+      "learning_rate": 0.00017431448254773944,
+      "loss": 0.98,
+      "step": 16
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.43604774988284023,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.8848,
+      "step": 17
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.41322803717963497,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 0.8688,
+      "step": 18
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3438188514971592,
+      "learning_rate": 0.00016293203910498376,
+      "loss": 0.8404,
+      "step": 19
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4206534215903609,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.8831,
+      "step": 20
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3738207537435881,
+      "learning_rate": 0.00015446390350150273,
+      "loss": 0.8456,
+      "step": 21
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3330511764130146,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8315,
+      "step": 22
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4268615101234299,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.864,
+      "step": 23
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.37305624152931305,
+      "learning_rate": 0.00014067366430758004,
+      "loss": 0.866,
+      "step": 24
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4441814226993271,
+      "learning_rate": 0.00013583679495453,
+      "loss": 0.8582,
+      "step": 25
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4018483090042229,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.8736,
+      "step": 26
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4021252408952179,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.8381,
+      "step": 27
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.42098023005943175,
+      "learning_rate": 0.00012079116908177593,
+      "loss": 0.8531,
+      "step": 28
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.49010442252024883,
+      "learning_rate": 0.0001156434465040231,
+      "loss": 0.8778,
+      "step": 29
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3425843067375386,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 0.8429,
+      "step": 30
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.41190711302446653,
+      "learning_rate": 0.0001052335956242944,
+      "loss": 0.8413,
+      "step": 31
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3228174068831545,
+      "learning_rate": 0.0001,
+      "loss": 0.8297,
+      "step": 32
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4895637910198642,
+      "learning_rate": 9.476640437570562e-05,
+      "loss": 0.8342,
+      "step": 33
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.34085577946149404,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.8517,
+      "step": 34
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.37467874711346916,
+      "learning_rate": 8.435655349597689e-05,
+      "loss": 0.8541,
+      "step": 35
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3564264425585218,
+      "learning_rate": 7.920883091822408e-05,
+      "loss": 0.9149,
+      "step": 36
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4019276143627365,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.8024,
+      "step": 37
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3803497603106543,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.8459,
+      "step": 38
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.36057301404640224,
+      "learning_rate": 6.416320504546997e-05,
+      "loss": 0.8604,
+      "step": 39
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.331752868629376,
+      "learning_rate": 5.9326335692419995e-05,
+      "loss": 0.8089,
+      "step": 40
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3529189783563381,
+      "learning_rate": 5.4600950026045326e-05,
+      "loss": 0.8371,
+      "step": 41
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3853558526586511,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.837,
+      "step": 42
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.32720251243737186,
+      "learning_rate": 4.5536096498497295e-05,
+      "loss": 0.7666,
+      "step": 43
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.33512069371273234,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.829,
+      "step": 44
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.33183884892489623,
+      "learning_rate": 3.7067960895016275e-05,
+      "loss": 0.8323,
+      "step": 45
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3138210466118749,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 0.797,
+      "step": 46
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3529433722657763,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.7543,
+      "step": 47
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.33443578542259444,
+      "learning_rate": 2.5685517452260567e-05,
+      "loss": 0.7823,
+      "step": 48
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4351743122080329,
+      "learning_rate": 2.2285403854302912e-05,
+      "loss": 0.8616,
+      "step": 49
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.31866210190267136,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.78,
+      "step": 50
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.35992837737853833,
+      "learning_rate": 1.6132943205457606e-05,
+      "loss": 0.8502,
+      "step": 51
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3685387261092368,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7642,
+      "step": 52
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.365422078433516,
+      "learning_rate": 1.0899347581163221e-05,
+      "loss": 0.8737,
+      "step": 53
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.29802642872061985,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 0.7968,
+      "step": 54
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3479896048917906,
+      "learning_rate": 6.6419573502798374e-06,
+      "loss": 0.7792,
+      "step": 55
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3341330140937082,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.7838,
+      "step": 56
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.33890270082577884,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.8476,
+      "step": 57
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3855122596118422,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 0.8064,
+      "step": 58
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3072954456332672,
+      "learning_rate": 1.231165940486234e-06,
+      "loss": 0.7585,
+      "step": 59
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.32832280350096327,
+      "learning_rate": 5.478104631726711e-07,
+      "loss": 0.8263,
+      "step": 60
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.432110699458833,
+      "learning_rate": 1.3704652454261668e-07,
+      "loss": 0.8444,
+      "step": 61
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.2950937605671529,
+      "learning_rate": 0.0,
+      "loss": 0.7306,
+      "step": 62
+    },
+    {
+      "epoch": 0.992,
+      "step": 62,
+      "total_flos": 156705435025408.0,
+      "train_loss": 0.8773936042862553,
+      "train_runtime": 1891.0897,
+      "train_samples_per_second": 1.058,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 62,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 156705435025408.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a362cdc6ced0373326ecc6d295eda4a1f3de46db
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "q_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f4380f4bd32963fe315e471f9e5ade402f303390
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5e8784ade77dde109b97f7f0c0f9930d41c11c4d0b5ae9879871d8a71f482b9
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2888fabbc04e6a99829c2f20d51e95bcfff05706
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cace5c78df0d415786b3c943fe25ccda287dd626adea57f389fdb8761dee8337
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4fd41b48b605770a356eb324056c4adf1dd1e5c1
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8761512161539348,
+      "learning_rate": 5e-05,
+      "loss": 1.4159,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.050411291756925,
+      "learning_rate": 0.0001,
+      "loss": 1.573,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.7042707932654642,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1491,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7180995438734898,
+      "learning_rate": 0.0002,
+      "loss": 1.0088,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.365840006786395,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.117,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7805949745223308,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.0718,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.48790456350172695,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.8674,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.569211715314236,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.9944,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.6438576703248184,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.9435,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5691926039461578,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.9537,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5735710492905692,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.9398,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5788683720151458,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.8513,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.6066358529501978,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.916,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.7458357794862776,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.847,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5091599140879502,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.8748,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5021115561896219,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.916,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.49670110297886205,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.8618,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6031073742650186,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.9549,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4695881467204669,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.8321,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5447846782765617,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.8936,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.6433587271879366,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.7734,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.569068431075782,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.9471,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.504188410609563,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8076,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.45271163917212337,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.8493,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5882580444650141,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.962,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.563830482355529,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.8956,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5110945514147994,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.8988,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4977997445045047,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.9509,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.656914022247126,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 1.0623,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.39541950218179106,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.7705,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.5298013165099509,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.927,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5581973615381166,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 1.0056,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4863895130531955,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.8686,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5216196632515393,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.8659,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4709873315394201,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8897,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.42140975185568336,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.8153,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4850686126078359,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.839,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.42663092480266446,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.8168,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.53592925059155,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.8923,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4776274769679771,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.8552,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.4525156648112374,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.7905,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.44609347704483965,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.8436,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.46291422204767446,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.831,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.43426351508110217,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.8187,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5096568332860442,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.9222,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.6195395152770948,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.7948,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5489690096452822,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.861,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4175506705673952,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.8515,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4650054032567586,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.8685,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5081807510201565,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.8163,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.6056212320947826,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.9239,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.47480587943042707,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.7968,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5429809344267971,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.8816,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4439436498680593,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.7849,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5744111918919593,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.8998,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4364047448911903,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.7857,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4896415969107107,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.8444,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.6044564082237883,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.9029,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4677944001963764,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.8561,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.423434247928326,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.8139,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.43531524490158446,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.7697,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.61656973701804,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.9032,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.41572884209120664,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.8574,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4098257972778533,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.7845,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3966625098251103,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7811,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4344170465996877,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.8776,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4276380086198874,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.8388,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4674414321713914,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.8589,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.41318359890735357,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.8143,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5788385254147412,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.8897,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.5405431977766236,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.9667,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.47114343948575044,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.8514,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.6246980866082275,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.8199,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.458244114171704,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.7683,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5054436738510697,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.794,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.514804478607465,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.8751,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.44096381256801,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.7778,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.5144965548813178,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.9338,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4378295019083376,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.7987,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.45383973182342063,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.8021,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.45819642148586637,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7742,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4516052417805142,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.8765,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4408962164195109,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.8315,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.649416453323441,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.822,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.38103530904658756,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.7313,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4408418986490477,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.7804,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.4777463565487696,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.8402,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3918054071841872,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.8028,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.49300712253228707,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.8599,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.35718980891187124,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.7782,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.422830451467894,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.7543,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4300306903126699,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.8066,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4164786718677969,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.7428,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.48481755337412774,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.7382,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4288045666562381,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.7569,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.42925681786070835,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.7759,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.7704343295584768,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.9136,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5260656963642337,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7824,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.43132606359606707,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.7234,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.42908704681963655,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.8099,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.41398213676404483,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.8135,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5811640214111465,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.8683,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5444268920907986,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.7694,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.47278317044341817,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.7332,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.46965595254274717,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.8266,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.49441143115174263,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.8984,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.38642163233312743,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.7505,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3923023476439001,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.8077,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.43042764386880217,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.7211,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5049229468813194,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.8149,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.3982449153940663,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.674,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5025847657970963,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.8631,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4614716302588619,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.8314,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.45098796276315717,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.8341,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.4716224685687741,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.7573,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5912212050693847,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.8294,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.41030071817831365,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.7215,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4448213077659771,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.7652,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.44403906442933844,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.8029,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4561204266052629,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.8229,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.5281674345762041,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.8658,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4788575110845754,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7892,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4274777825254005,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7265,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3849614574513801,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7138,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.41576314638434686,
+      "learning_rate": 0.0,
+      "loss": 0.8058,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 108802645983232.0,
+      "train_loss": 0.8562675738334655,
+      "train_runtime": 1916.1262,
+      "train_samples_per_second": 1.044,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 108802645983232.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8223bbc9d4cfe40a6e4689d0ebc22ac5333f2fee
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..086fb413b4f1ce56380d39826209c56704018d42
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1e61d2d7404d40c644ae5ad81cc28471577c720faf1a08d3b73334a331a5c5b
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2decdb525660fba77353047c761c17f5e578a364
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41c6d522067de7256ddf8ad16476dcd3bf63496b9ac6a049a6d232143081056c
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..df9bac2ea501484355d775e0df6cc31047895203
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,476 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8733075285737993,
+      "learning_rate": 0.0001,
+      "loss": 1.4945,
+      "step": 1
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7626200098479858,
+      "learning_rate": 0.0002,
+      "loss": 1.2212,
+      "step": 2
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.9150383578127355,
+      "learning_rate": 0.0001998629534754574,
+      "loss": 1.2654,
+      "step": 3
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.8096181977268556,
+      "learning_rate": 0.00019945218953682734,
+      "loss": 1.0377,
+      "step": 4
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5351923550168005,
+      "learning_rate": 0.00019876883405951377,
+      "loss": 0.9988,
+      "step": 5
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4384781912498458,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 0.9383,
+      "step": 6
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.7226007969998041,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.9236,
+      "step": 7
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.9113171413285274,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.9274,
+      "step": 8
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5024327863132997,
+      "learning_rate": 0.00019335804264972018,
+      "loss": 0.9444,
+      "step": 9
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7239658506567372,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.9094,
+      "step": 10
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4996150518477214,
+      "learning_rate": 0.0001891006524188368,
+      "loss": 0.8923,
+      "step": 11
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.40452327684044664,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8539,
+      "step": 12
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.43841751967829246,
+      "learning_rate": 0.00018386705679454242,
+      "loss": 0.9433,
+      "step": 13
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.41410096043074957,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.9434,
+      "step": 14
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4184189668224682,
+      "learning_rate": 0.0001777145961456971,
+      "loss": 0.9297,
+      "step": 15
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4457447902412213,
+      "learning_rate": 0.00017431448254773944,
+      "loss": 0.9793,
+      "step": 16
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3818143910735278,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.8853,
+      "step": 17
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3783548112578088,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 0.8698,
+      "step": 18
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.34701134667864025,
+      "learning_rate": 0.00016293203910498376,
+      "loss": 0.8403,
+      "step": 19
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4089284076654726,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.8835,
+      "step": 20
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3704098957366238,
+      "learning_rate": 0.00015446390350150273,
+      "loss": 0.8466,
+      "step": 21
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3373174414785272,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8322,
+      "step": 22
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.43441879102459396,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.8635,
+      "step": 23
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.41692608982811813,
+      "learning_rate": 0.00014067366430758004,
+      "loss": 0.8661,
+      "step": 24
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.39519755910158144,
+      "learning_rate": 0.00013583679495453,
+      "loss": 0.857,
+      "step": 25
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.39860172267169647,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.8767,
+      "step": 26
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.34911065356900484,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.8397,
+      "step": 27
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4110063667874368,
+      "learning_rate": 0.00012079116908177593,
+      "loss": 0.8537,
+      "step": 28
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4020608405658079,
+      "learning_rate": 0.0001156434465040231,
+      "loss": 0.8779,
+      "step": 29
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3281161783726237,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 0.8438,
+      "step": 30
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.39975007050586536,
+      "learning_rate": 0.0001052335956242944,
+      "loss": 0.8427,
+      "step": 31
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3045871941817785,
+      "learning_rate": 0.0001,
+      "loss": 0.8293,
+      "step": 32
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3131284149787335,
+      "learning_rate": 9.476640437570562e-05,
+      "loss": 0.835,
+      "step": 33
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3324456643384695,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.8543,
+      "step": 34
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3627683766090873,
+      "learning_rate": 8.435655349597689e-05,
+      "loss": 0.8516,
+      "step": 35
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.38696135375325097,
+      "learning_rate": 7.920883091822408e-05,
+      "loss": 0.9156,
+      "step": 36
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.45486873904707503,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.8027,
+      "step": 37
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.38133658750527805,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.8479,
+      "step": 38
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.37712780316547756,
+      "learning_rate": 6.416320504546997e-05,
+      "loss": 0.8609,
+      "step": 39
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.34231024678456085,
+      "learning_rate": 5.9326335692419995e-05,
+      "loss": 0.8113,
+      "step": 40
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.35345370336204857,
+      "learning_rate": 5.4600950026045326e-05,
+      "loss": 0.8387,
+      "step": 41
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3746605723300753,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.8365,
+      "step": 42
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.34761992672468933,
+      "learning_rate": 4.5536096498497295e-05,
+      "loss": 0.7645,
+      "step": 43
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3407070364965428,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.8283,
+      "step": 44
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.35823118986429253,
+      "learning_rate": 3.7067960895016275e-05,
+      "loss": 0.8343,
+      "step": 45
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.31586195949350904,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 0.7969,
+      "step": 46
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3346111912665213,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.7553,
+      "step": 47
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.32509412696237266,
+      "learning_rate": 2.5685517452260567e-05,
+      "loss": 0.7828,
+      "step": 48
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4154838364632596,
+      "learning_rate": 2.2285403854302912e-05,
+      "loss": 0.861,
+      "step": 49
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.33041909272930825,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.7807,
+      "step": 50
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.35766555810690437,
+      "learning_rate": 1.6132943205457606e-05,
+      "loss": 0.851,
+      "step": 51
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3647144892579273,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7653,
+      "step": 52
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.355996832861928,
+      "learning_rate": 1.0899347581163221e-05,
+      "loss": 0.8726,
+      "step": 53
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.29079425564647043,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 0.7976,
+      "step": 54
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.34186894289874137,
+      "learning_rate": 6.6419573502798374e-06,
+      "loss": 0.7814,
+      "step": 55
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.32976235250953534,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.7826,
+      "step": 56
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.336610764133243,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.847,
+      "step": 57
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.44920111583511707,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 0.8073,
+      "step": 58
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.303280682511107,
+      "learning_rate": 1.231165940486234e-06,
+      "loss": 0.7587,
+      "step": 59
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.33694982931979206,
+      "learning_rate": 5.478104631726711e-07,
+      "loss": 0.8262,
+      "step": 60
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.37592344631210955,
+      "learning_rate": 1.3704652454261668e-07,
+      "loss": 0.8454,
+      "step": 61
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3106932127809868,
+      "learning_rate": 0.0,
+      "loss": 0.732,
+      "step": 62
+    },
+    {
+      "epoch": 0.992,
+      "step": 62,
+      "total_flos": 156705435025408.0,
+      "train_loss": 0.87800558824693,
+      "train_runtime": 1890.0284,
+      "train_samples_per_second": 1.058,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 62,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 156705435025408.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..212513f92bc58f25f51cf6ad20b940b533d87927
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "gate_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f19d6a487bbb69fb73a2fb9ab5d1a5d228c9d80f
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:972b29cd4005ef2d7fdae4bbda94735be58210f4ad6bd0120728509dff4d0ece
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e07fa1cbb481a9260e9730e10bdc64b284e0bff2
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:837a86515142dba5f3b3a152dd870395470a4649be3885ec4837d31a27d1066d
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..604095031c07b3b7b54574b0abcfe869efe56a53
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8879978778964512,
+      "learning_rate": 5e-05,
+      "loss": 1.4159,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.0640587353265116,
+      "learning_rate": 0.0001,
+      "loss": 1.573,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.7072842736256486,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1483,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7278177254379742,
+      "learning_rate": 0.0002,
+      "loss": 1.0088,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.3856128325715313,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.1177,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7962273126153702,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.0713,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4851005926062388,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.8668,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5704754468532358,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.9944,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.6370909561340997,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.9445,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5761117959066886,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.9539,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5617353727973317,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.9408,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.6024206513926328,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.8523,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.6260623836282878,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.9149,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5506502816329165,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.8466,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5118404705648577,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.8748,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5023021511003569,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.9148,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4929964716691261,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.8617,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6016874638384124,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.9531,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.7943588764253823,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.8304,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5624067044042429,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.8937,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5137368238230682,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.7733,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5984174701135948,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.9474,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.49681397248538706,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8082,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.454965958486758,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.8496,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5985658301217804,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.9601,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5627873164605425,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.8974,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5026276671463572,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.9,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.486593416327982,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.9505,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.6283343501267957,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 1.0609,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.42772552574072337,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.7684,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.5736507946552474,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.9267,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5790647433139106,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 1.0016,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.47462305797922866,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.8663,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5666313504012336,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.871,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4773490934060742,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8904,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4234812230255282,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.8139,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4990989020796822,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.8389,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4770749552433828,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.8146,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.5428985898936901,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.8911,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.48556669182308065,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.856,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.45066101718445134,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.7889,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5596146939817267,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.8416,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4637305909907296,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.8317,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.45397598252864746,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.8197,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5269813475211595,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.9236,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.6433775232426182,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.7944,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5740391756017929,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.8614,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4145784713265135,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.8548,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.47076052554668874,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.8696,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5383204455379367,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.8132,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.6316835168318365,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.9257,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4389686857107661,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.7944,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.528528046003657,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.88,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4515615106184379,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.79,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5638075965910785,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.9044,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4483567448450639,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.7843,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4893105736697421,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.8439,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.7910093036400055,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.9019,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4593444659971679,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.8549,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4526634408729684,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.8131,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.437064926080817,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.7716,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.6253017004724702,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.9032,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.4339983184927352,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.8569,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4266326998319048,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.7823,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.42051755007256886,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7825,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.44408921014152003,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.8759,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.40700600974757534,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.8383,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4517769595509566,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.8548,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.41724766653934686,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.8084,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5579726731830523,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.8889,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.5308010558194547,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.9703,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4267154953057778,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.8524,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.8832196614039924,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.8189,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4759374903354764,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.7674,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.590099439862415,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.7925,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.5315958714433993,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.8744,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4757057861756777,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.7804,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.5836975764080544,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.9328,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.45735269301853904,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.8001,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.45530769593290443,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.8025,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.49428441937770373,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7787,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4457052001040438,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.8766,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4708964909784785,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.8302,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5617921551900704,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.8202,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.39166989111298783,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.7312,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4285481525663216,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.7795,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.48008755427076916,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.8391,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4186970025700412,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.8024,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4926271179696444,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.8561,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3603863978855644,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.7751,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4037378309144934,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.7526,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4306313376911504,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.8085,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.42392952585080557,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.7435,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.520342827531211,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.7343,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.47349477569468973,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.757,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.45447378956605405,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.7797,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.7328627041998275,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.9105,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5035338569997073,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7852,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.5665842460961545,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.7229,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4494589285982409,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.8121,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.43254498390631047,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.815,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.617162777110529,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.8678,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5279563176534565,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.7656,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4863656459408922,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.7324,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.46759428694355865,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.8252,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.5053624111570554,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.8967,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.39953892951805164,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.7525,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.39344461115660584,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.8081,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.41587291844449736,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.72,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5107460153618854,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.8137,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4283624775790327,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.6745,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.49728016420382526,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.8618,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4631269586439,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.8318,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.46178615828089614,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.8361,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.562034028031936,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.7564,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5856105220033522,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.8278,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.41761741807300334,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.7205,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.419373493627604,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.7671,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4406709425252194,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.7999,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.47182624579616544,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.8259,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.5641461266322076,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.869,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4480973058440261,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.789,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.44802480682065154,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7251,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3824694645024767,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7126,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.6799359587143076,
+      "learning_rate": 0.0,
+      "loss": 0.806,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 108802645983232.0,
+      "train_loss": 0.8560487117767334,
+      "train_runtime": 1917.3264,
+      "train_samples_per_second": 1.043,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 108802645983232.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ced61a90bfa4f993cfdc31fc5336ee45b8b47bb
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "o_proj",
+    "v_proj",
+    "q_proj",
+    "k_proj",
+    "down_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..169908976f9af289dce363eb08da88f3b0610d42
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22c060e9cf31c64d068903084611c41e12212e6f2260b5294053f8de5ff057b5
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..846b235c71968a2ab5144038ab426ec1a171b972
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86d111db45517ea0f1c039b14d931edba26238ac1a7b6b84cd68ec71e83040b1
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..900fc95de75dd016686c93138eeb39f3a98468ec
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,476 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.992,
+  "eval_steps": 500,
+  "global_step": 62,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8867986198698434,
+      "learning_rate": 0.0001,
+      "loss": 1.4945,
+      "step": 1
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7738820351403444,
+      "learning_rate": 0.0002,
+      "loss": 1.2212,
+      "step": 2
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.9266735811948859,
+      "learning_rate": 0.0001998629534754574,
+      "loss": 1.265,
+      "step": 3
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.8094614882041049,
+      "learning_rate": 0.00019945218953682734,
+      "loss": 1.0362,
+      "step": 4
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5320508925575772,
+      "learning_rate": 0.00019876883405951377,
+      "loss": 0.9992,
+      "step": 5
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.43898395144316243,
+      "learning_rate": 0.00019781476007338058,
+      "loss": 0.9388,
+      "step": 6
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.7153392139613666,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.9224,
+      "step": 7
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.9509151430023896,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.9274,
+      "step": 8
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5057363843742055,
+      "learning_rate": 0.00019335804264972018,
+      "loss": 0.9442,
+      "step": 9
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7551699518116433,
+      "learning_rate": 0.0001913545457642601,
+      "loss": 0.9086,
+      "step": 10
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4975706847682533,
+      "learning_rate": 0.0001891006524188368,
+      "loss": 0.8915,
+      "step": 11
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.40630493493866054,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.852,
+      "step": 12
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.44773052860815865,
+      "learning_rate": 0.00018386705679454242,
+      "loss": 0.942,
+      "step": 13
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4107660930492455,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.9426,
+      "step": 14
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4433638296469529,
+      "learning_rate": 0.0001777145961456971,
+      "loss": 0.9291,
+      "step": 15
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.44739182714973635,
+      "learning_rate": 0.00017431448254773944,
+      "loss": 0.9804,
+      "step": 16
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3916367426859159,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.8843,
+      "step": 17
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3618881232704661,
+      "learning_rate": 0.00016691306063588583,
+      "loss": 0.8707,
+      "step": 18
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3538209495683395,
+      "learning_rate": 0.00016293203910498376,
+      "loss": 0.8415,
+      "step": 19
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4145025403872177,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.8846,
+      "step": 20
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4724154080257005,
+      "learning_rate": 0.00015446390350150273,
+      "loss": 0.8455,
+      "step": 21
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3422466750710581,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8319,
+      "step": 22
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.44464519689398796,
+      "learning_rate": 0.00014539904997395468,
+      "loss": 0.864,
+      "step": 23
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3791084014576181,
+      "learning_rate": 0.00014067366430758004,
+      "loss": 0.8666,
+      "step": 24
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.38043102645498195,
+      "learning_rate": 0.00013583679495453,
+      "loss": 0.8567,
+      "step": 25
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4175641638088068,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.8739,
+      "step": 26
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.35545734698454556,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.8384,
+      "step": 27
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.41690406097653954,
+      "learning_rate": 0.00012079116908177593,
+      "loss": 0.8525,
+      "step": 28
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4062190743040883,
+      "learning_rate": 0.0001156434465040231,
+      "loss": 0.8763,
+      "step": 29
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3363888359929472,
+      "learning_rate": 0.00011045284632676536,
+      "loss": 0.844,
+      "step": 30
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3930958740166345,
+      "learning_rate": 0.0001052335956242944,
+      "loss": 0.841,
+      "step": 31
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.37847718532763863,
+      "learning_rate": 0.0001,
+      "loss": 0.8293,
+      "step": 32
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3219461813331027,
+      "learning_rate": 9.476640437570562e-05,
+      "loss": 0.8349,
+      "step": 33
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3661640714738879,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.8533,
+      "step": 34
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.38591513795082677,
+      "learning_rate": 8.435655349597689e-05,
+      "loss": 0.8526,
+      "step": 35
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.36236104928671203,
+      "learning_rate": 7.920883091822408e-05,
+      "loss": 0.9167,
+      "step": 36
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4145275115654983,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.8045,
+      "step": 37
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4369453021225843,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.8479,
+      "step": 38
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.37218240505222444,
+      "learning_rate": 6.416320504546997e-05,
+      "loss": 0.8621,
+      "step": 39
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3429280973786549,
+      "learning_rate": 5.9326335692419995e-05,
+      "loss": 0.8107,
+      "step": 40
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.35109691547833866,
+      "learning_rate": 5.4600950026045326e-05,
+      "loss": 0.836,
+      "step": 41
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4341433309538033,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.8351,
+      "step": 42
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.31035920324535987,
+      "learning_rate": 4.5536096498497295e-05,
+      "loss": 0.7654,
+      "step": 43
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.37521399138917083,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.8292,
+      "step": 44
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3411541590331352,
+      "learning_rate": 3.7067960895016275e-05,
+      "loss": 0.8334,
+      "step": 45
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.44473950816153507,
+      "learning_rate": 3.308693936411421e-05,
+      "loss": 0.7954,
+      "step": 46
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3398517801939119,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.7537,
+      "step": 47
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3483422078250861,
+      "learning_rate": 2.5685517452260567e-05,
+      "loss": 0.7816,
+      "step": 48
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.49908330553390123,
+      "learning_rate": 2.2285403854302912e-05,
+      "loss": 0.8616,
+      "step": 49
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.32219378472981447,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.7809,
+      "step": 50
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3576051040484641,
+      "learning_rate": 1.6132943205457606e-05,
+      "loss": 0.8512,
+      "step": 51
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3735596853921932,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7639,
+      "step": 52
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3560450371172911,
+      "learning_rate": 1.0899347581163221e-05,
+      "loss": 0.8724,
+      "step": 53
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.29511183321492873,
+      "learning_rate": 8.645454235739903e-06,
+      "loss": 0.7966,
+      "step": 54
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.44031345535426497,
+      "learning_rate": 6.6419573502798374e-06,
+      "loss": 0.7795,
+      "step": 55
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3309328569866633,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.7844,
+      "step": 56
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3603181651932308,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.8482,
+      "step": 57
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.466040489343428,
+      "learning_rate": 2.1852399266194314e-06,
+      "loss": 0.8068,
+      "step": 58
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.30075922405904837,
+      "learning_rate": 1.231165940486234e-06,
+      "loss": 0.7587,
+      "step": 59
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3538523629140039,
+      "learning_rate": 5.478104631726711e-07,
+      "loss": 0.8258,
+      "step": 60
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3569845319895022,
+      "learning_rate": 1.3704652454261668e-07,
+      "loss": 0.8445,
+      "step": 61
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.2949839015637882,
+      "learning_rate": 0.0,
+      "loss": 0.7312,
+      "step": 62
+    },
+    {
+      "epoch": 0.992,
+      "step": 62,
+      "total_flos": 156705435025408.0,
+      "train_loss": 0.877654165990891,
+      "train_runtime": 1892.0343,
+      "train_samples_per_second": 1.057,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 62,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 156705435025408.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea468be95b86557f25bc23a1f4720bbd1de0bf29
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9609247b12e0beaa02adc29ca7fe922ff26f9f81
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32eb43786b7fabe1f87e1fef21f803622ba7d21ad538e091c646d44679c373a6
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2a5f37bdc5952bfadf2a7121a1723b23fb31207e
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b66f972032a2f1b66440386c612ad56a7754415649a955e982b27b3d64eecde
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..72283c922ee259fe9289169468447bbb978ffd40
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.9085631201505874,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.3821,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.068586851219567,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.4152,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.1395277735386191,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.5638,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 1.0135378511790643,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.4522,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.80918697698487,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.2903,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9639299157020399,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4671,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.9027715291571133,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.2785,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.9988954920089032,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.2609,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8737678584771005,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.2049,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.8872070696199936,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.0834,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.8304838669952055,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 1.0387,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8864023129511615,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.0163,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.7752916378982887,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 0.9885,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.8762016533492555,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 1.0494,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7165092903247978,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.9618,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.7726654208325324,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 0.9431,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.9228722088023148,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 0.9847,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.746543578214571,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.0573,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.8047522847578392,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 1.0574,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.5964144600984687,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.9676,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.60045647351782,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9894,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.7442466900966471,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.9578,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.5321420807004554,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.8634,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6032438075403467,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9023,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.7980831015576652,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 1.0827,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.5555494381624063,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.9044,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.6266601305928058,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9779,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.5186428351623814,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.825,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.5084666492543204,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.8728,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5653541946622131,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9396,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.4669593198331131,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.8382,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.6007810072183271,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.9553,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6628881782369296,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.915,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.5895110692873703,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 0.9507,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.5528181448399232,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.8627,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.4809400678734925,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.866,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.6669136602328961,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.8893,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.45424161088883114,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8701,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.9312064245199226,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.903,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.4876153422693338,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.8838,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.5678233493416778,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.8759,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5465196824018712,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.848,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.5273025747227208,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.9227,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.5067360005902175,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.8986,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.47579307220958644,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8301,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.460989520919059,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.821,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.5989674101088642,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 0.8319,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5960449532932831,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9335,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.54580761575243,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.8667,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.5038509422201911,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.8917,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5679216690440128,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8189,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.5871413820404406,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 0.8273,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.4889911646751289,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.8638,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5489075681198782,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8531,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.45848667873814786,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.8195,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.4868086323108441,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.8177,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.5498866339393426,
+      "learning_rate": 0.0002,
+      "loss": 0.9293,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.4392085273854709,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.7674,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.4883096165410377,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.7862,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.564816874703866,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8073,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.49496782164668834,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.7516,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.5796277324241305,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.9386,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5380688214470547,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9645,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.5143663475650041,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.8644,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.6383979690564672,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.8857,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.42852494697972915,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7589,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.4998944058773902,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.8256,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.5793129631947758,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.8614,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.713890759559541,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8407,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.5369980492105534,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.8322,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.4529369284224218,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.7193,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5690506518321555,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8708,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.5521579446525727,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.7786,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.5336067285134041,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.8664,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5518599034723031,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9068,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.6877374072176633,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.9134,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.4970737269568522,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8253,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5219763125913343,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8578,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.4674974856082471,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.8279,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.45539256115663496,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.7985,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.4777055756746316,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8642,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.5450178037063758,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.8814,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.4614070714159011,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 0.8954,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5573297368971848,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8862,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.5187820436747881,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.8315,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.5045760148611361,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.7794,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5545621591400848,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9663,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.5889943305281856,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.7928,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.4180158904494214,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.7492,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5208286483969923,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8654,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.5379832421251245,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.9188,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.4178084704556727,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.7925,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.6423083548955081,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8799,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.4363063954138688,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.7833,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.4492942195522816,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.7223,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.40651863801450383,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7098,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.5485179420244607,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 0.8244,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.5216996443685623,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.8438,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4892007208218577,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.813,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.5623388944452945,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.7848,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.4975467207735736,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.8596,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5020233346000809,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.7924,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.509802842567309,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.8483,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.4752531543766072,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.7753,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.5400673395064166,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8772,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.5281212630791382,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.8371,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.45669538101918083,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.8797,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5383261579870747,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8391,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.41927024388142425,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.7537,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.4806245830012148,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.8412,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.47358504691457715,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8281,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.5134737814762356,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.8028,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.6156393013431956,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 0.9312,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.546049450545441,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8717,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.46144715715128237,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.7497,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.45456147951236814,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.8142,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.5334480256071559,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8136,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.4785825495210441,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.8399,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.4429872267481487,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.829,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.41957670746548625,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7656,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.5226265400262154,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.8057,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.45197453618568495,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.8162,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5370842279108822,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.858,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.5278598246578419,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 0.8529,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.4307941956210138,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.7943,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.9311392635671509,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8568,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.47878729794796454,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.8028,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.5159824714235858,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.7925,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5310097267625962,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7569,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.5304017620742338,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.9217,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.477716276955551,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.8125,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4725067764282243,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8805,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.6881740377890267,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.887,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.4875043852244181,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.782,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5130414160416645,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7694,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.5391783404356717,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.8577,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.5712372275179636,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.7942,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.45363887852434787,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.839,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.5545661226467888,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.9437,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.595253959791159,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.8857,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.49420251696911627,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.801,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.5828222082646621,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.8431,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.456940614648609,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.7611,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5262527757792694,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7483,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.5489396698689593,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.8635,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.5121258488880777,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.7931,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.509625436737974,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8112,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.579146513596682,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.9082,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.5797729980185812,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.8965,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.45670114935471384,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8003,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.4443985588243516,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.7844,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.5746325961849151,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.9032,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4566250635842938,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7799,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.4908818510619844,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.7718,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.416211607415965,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.7958,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.631855768937812,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.9386,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.4724998311690075,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.814,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.4803535727141574,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.7836,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5313118301780214,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8319,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.5191932761602162,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.8023,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.4806050779256414,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.7554,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.40209591541789275,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7309,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.4604431057839949,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.7839,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.4769642986057626,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.7833,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.422729784192873,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.762,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.44142648781893834,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.7934,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.5541096201508515,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.8321,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5422485307509299,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8839,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.5227081457902739,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 0.8218,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.534815575976312,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.7774,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5403203503391552,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.7982,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.576693983122081,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.9144,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.4169461755825807,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.8474,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4870863960739979,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.8239,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.5749497155337887,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.8456,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.5064223745333338,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.9018,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.5939486595032151,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.9005,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.49815470909038534,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 0.8434,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.481621086254783,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8234,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.505841358462432,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.7911,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.4751337016096947,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.8224,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.4888071344716639,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.8283,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.42852639924395564,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.7659,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.4894706337105787,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.8146,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.47995694376677095,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.7677,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.40956111264017714,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7849,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.5005873859350766,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.8069,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.44969092925862897,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.8382,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.4949683388611651,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8437,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.558226216988814,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.7963,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.45001350872696166,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.7551,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4928566894907739,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7632,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.43447746260199227,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.7922,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.40950198847704977,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.7839,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.451123103338457,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8374,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.5638232141960325,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.8671,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.4926491803108822,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.828,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.417058057741698,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7885,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.5193930068953387,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.8408,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.6467116680706188,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.9473,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.4807857740866099,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8567,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.444645386230921,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.8155,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.605533417685553,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.8157,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5625649255755447,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.9247,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.5199827137553125,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.8048,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.42294231496595297,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.755,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5387133298218557,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8904,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.4930695402459019,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.8664,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.4987578810937898,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.7706,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.46922494214437366,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.783,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.5742260794727807,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8679,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.42914800487842647,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.8216,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.42547217981059254,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7727,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.5358386531443239,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.787,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.40809276943976613,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.7589,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4424225122688686,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7482,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.5158570076959549,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.8352,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.5310650627081238,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.8178,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.47431280993029923,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8207,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.5369714632768122,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.8084,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.5593803465684531,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.9041,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5610667976183104,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8802,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.7061778966452937,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.8686,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.45701406729521243,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.7871,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5166685942247737,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8284,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.6590093338992272,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.894,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.4459829543325769,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.8063,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.41774887674303934,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7991,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.5112565978792027,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.7925,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.7507651170717319,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.7663,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.5510335315201351,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8079,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.43242819507526936,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.7853,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.4938371182719445,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.8019,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4253874813934552,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7361,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.43124181324287203,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.7265,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.5106344521185178,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.7504,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5402561312252943,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.8647,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.4606947598104532,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.772,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.4397113395758952,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.7404,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.573784095147914,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8604,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.4922693165855001,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.8037,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.4415770650690034,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.7467,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.7202700055785279,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8997,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.46620151369209545,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.721,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.48374808497981153,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.7327,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.46228533989157017,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.813,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.48389874544582195,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.8091,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.4495867885666229,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.7903,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4943482100204703,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8486,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.39190517051078805,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.7646,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.46164485820158224,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.8112,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5599512420613735,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8938,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.4994115747001175,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.8471,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.44477486433110613,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.7916,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4864415309029597,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8253,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.5150608337350671,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 0.8124,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.4123466909003194,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7501,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4968072417994427,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7999,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.5521519947033275,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.883,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.49723016490414984,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.8368,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5706367109724512,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8496,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.4222133193599781,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.7991,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.5061623424371642,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.7889,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5372415116588385,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7752,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.4303635042507931,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.7048,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.46242534095827137,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 0.8492,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.470057726299895,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7727,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.6738560257297568,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.8035,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.45484972653733813,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.7347,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4165893434189079,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7177,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.47973820077673446,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 0.8863,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.5063715481565947,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.7737,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.48835833856488436,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8563,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.4395440943990699,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.7804,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.4979492435398362,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 0.7873,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.41945522453959166,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7641,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.5756452943189352,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.8103,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.5278354578833819,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.7678,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.5187228675688395,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7779,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.5084344970815391,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.8791,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.43249457702628513,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.724,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.8073685700016736,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.9622,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.4914654636072854,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.8337,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.48357646647529584,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.7225,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4310719779438626,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8143,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.393733200337738,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.7469,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.46631053413694085,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.8536,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5267547426065811,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7167,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.5298582908589178,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.8328,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.4962593380924952,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.8272,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.47633365700766406,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7841,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.6184731909045871,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.8716,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.5423682614680138,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.8162,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4753038045880565,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7919,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.43126625082440967,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.7528,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.5098174762604958,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.7522,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4521083102475084,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7991,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.4491062878449407,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.7777,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.437593424681035,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.763,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5399001237902789,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7841,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.4954818390198195,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 0.7606,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.4880227953266813,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.8003,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4714495974948282,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8491,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.4139041793775968,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.7971,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.4376309534083468,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.7318,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4970619286626978,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8498,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.4405073792264246,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.8222,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.4159170308634145,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.7393,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.4918985235769081,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.8494,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.43915509097101807,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.7508,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.4262514693238682,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.7063,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.44518080468138016,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7023,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.44798453783013126,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.8001,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.5221309013695491,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.763,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5015011598188029,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7695,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.4717158377294084,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.7668,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.519902869646817,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.8886,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.40475351672985693,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7456,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.47398524364066114,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7906,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.5131552000446418,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.7802,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4865027391662875,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7945,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.4072220371863116,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.7384,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.5496872257803109,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.8269,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5145981423411348,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.6772,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.4821094201599681,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.7669,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.4761904012202571,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.756,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.5751089208138552,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8186,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.5357565464675319,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.7737,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.44306512734774434,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.79,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4441691047508839,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.812,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.41442330109161496,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.7088,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.45195857961777847,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.7549,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4611292039707245,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7933,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.47194603710974975,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.7678,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.4181099450281632,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.8026,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4039528236321422,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7123,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.43416047403619185,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.7105,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.4122624485330412,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.726,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5140401902686076,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8084,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.4081266792459306,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.7446,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.4610111592380031,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.7713,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5000624203011732,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7945,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.5525391954986797,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.7058,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.4935700600524609,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.8355,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5179993188442348,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7575,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.45452978446209813,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.6695,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.4721788518305318,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.762,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4409461330278918,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7311,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.42867380804011745,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.7423,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.4642311275096459,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.7603,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.5211127221392075,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7624,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.50544600584653,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.7266,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.48471960478633924,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.8137,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5004536148113095,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8497,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.4341950230096382,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.7625,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.4163684696163548,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.7397,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4621245017314665,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7982,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.4986620999162991,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.8684,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.6101277639283995,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.855,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5920423165090527,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.9037,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.43327268782626305,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.7659,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.45571022796915445,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.818,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.5126591759054586,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8502,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.5093667930965703,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.7239,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.4468011573800045,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.803,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.39014778684031004,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7288,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.5208611631026197,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.8064,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.5658619006275716,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.79,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5306869958440832,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8517,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.41596871920773343,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.7658,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.4769088573609008,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.8171,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.39326204339613774,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.6806,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.4723424029911581,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.7456,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.42068601070158923,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.7408,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.47245125655841735,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7825,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.45630395670920204,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.7221,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.45953926555663455,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.6982,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4068974961988575,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7189,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.5866832999064415,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.904,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.4332083949076997,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.6994,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.5086504725226426,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8404,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.54160624793637,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.7675,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.4638294028941344,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.8093,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4418799635284932,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7754,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.4387775361332178,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.7714,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.47276823056876205,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.6988,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4347940653732354,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7045,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.4954054214191427,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.7661,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.4070067571016391,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.7161,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4663594433106534,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7952,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.46838253955764675,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.8083,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.5202814517890559,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.7643,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5123534446161135,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8352,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.49453488048027294,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.8083,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.4944787757352668,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.8,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.6122735332559011,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7948,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.6068704476523478,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.8312,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.56855732304193,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.7921,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.43206934589137125,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.758,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.4280314397137237,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.7688,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.44525241902820006,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.7484,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4672559340463771,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7284,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.47010851180359864,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.7738,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.4347907834237806,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.7609,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5185870081958338,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7889,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.38201472047316243,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.6871,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.5105212067074328,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.7924,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4566184882131342,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7331,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.419991866669449,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.6958,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.42825530377397397,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.7512,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.39509894216822994,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.6676,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.426398643286428,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.7766,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.437866021319674,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.7289,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4768363463629006,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7541,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.43166674310102465,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.7144,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.5097188904050277,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.7777,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.527118178931027,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8253,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.46308375087316267,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.6781,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.4265418606802498,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.7129,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5470685105632853,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.754,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.5449023406053731,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.7436,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.5002866378720431,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.809,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.436075859668813,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7931,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.5547698136524428,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.7451,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.42423881229772237,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.7424,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.41932308479173486,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7207,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.44222308190052617,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.8142,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.43376550354966625,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.7304,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.45554808496225185,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7902,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.39233811953378955,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.6922,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.5143180631551983,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.8508,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5093664841627964,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7694,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.42795052337689815,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.7434,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.41525741778223646,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.7508,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.47323428827658764,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.834,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.4801983649843368,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.7548,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.5060015069434444,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.858,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4361409062518517,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7244,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.49103920706529286,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.7844,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.5653299481861203,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.7917,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.4353872202467468,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7144,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.5014059158632823,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7122,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.5436335750211385,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.7866,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.41721531036335957,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.6763,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.507161918631008,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.7603,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.5885444045544327,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.8038,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.40265878560133256,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7168,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.6017448386092624,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.7982,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.45832934136293313,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.8608,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.41844917394248277,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8087,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.46507691971111825,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.7491,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.47351688392240704,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.8074,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4154239201061571,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.729,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.44021123134243856,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.6933,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.5328921997969187,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.8426,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5102523624853145,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7099,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.457351829008867,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.7707,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.7127093511519104,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.8578,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.39037585116747653,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7079,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.4387647959996424,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.6804,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.5288638738165535,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.8139,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.47115878113596116,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7384,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.46183812959425985,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.7397,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.5000788732299103,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.8112,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.48262617864872853,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8582,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.42645130758657335,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.6859,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.48305386241043113,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.7342,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5387863030614242,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8496,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.45387368932873007,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.7747,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.4891518550832705,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.7686,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.6491626825140133,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8211,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.40152077036105355,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.7162,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.48088470877038086,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.8232,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4326827443571955,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7644,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.4026907127984479,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.7148,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.3808283680729642,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.7195,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4629461315833693,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7329,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.47849932306356757,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.7467,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.4321013194596791,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7224,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4296075615472621,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7698,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.4501563585562932,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 0.8284,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.4298971262426949,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.7613,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4659220801901008,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7692,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.5060198184251572,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.7403,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.40249062290001575,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.7152,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5330978060621342,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.791,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.6214848365468837,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.7896,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.43978605057364534,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.7661,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5453447632069873,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8464,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.48435012249024784,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.8194,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.4303372788774748,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.6974,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.42014942047868853,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7546,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.37255071927968164,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.7495,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.5001674040197623,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.8021,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3779890246400226,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7286,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.5021655118073337,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.8506,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.45595243514127176,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.7342,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4726357599590175,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.74,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.4521335584835581,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.709,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.43127508095845263,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.7497,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.5599697811197301,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8137,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.3804029533265518,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.6915,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.746183002609445,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.7697,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5130232918519868,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.806,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.5213460024416292,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.8428,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.41022692879256917,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.7383,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.5578436217570287,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.696,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.45664610602130395,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.8532,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.5585428729991476,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 0.8265,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4244871717543694,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7385,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.5227791534252502,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.8892,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.5318292679279191,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.8573,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4615248700945455,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7564,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.49176912695016994,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.8478,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.44791191354003707,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.7504,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.45997484601856076,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.736,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.4827218607134327,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.7537,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.4929806803809034,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.7408,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4618438899566801,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7292,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.3990480991514046,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.6469,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.5689347089145561,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.8524,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.42746924542558523,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7454,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.45646558867280534,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.7797,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.4099030337127495,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.7424,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4699366544016761,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8001,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.48801320135287946,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.7731,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.4729783464102199,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.7898,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.366664477964398,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7197,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.6489799939556533,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.8747,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.4004114664963537,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.6793,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.39106408936041676,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7231,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.7550406323196349,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.8428,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.5840244880545498,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.8486,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4751770085819043,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7912,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.4490688524112541,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.7203,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.43911848862092817,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.7527,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4024482136911001,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7584,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.39530631455486304,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.7077,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.3803203234642549,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.6664,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.590917846307457,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8629,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.40770436203904703,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.7312,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.4041210416957774,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.6492,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.4433940191733088,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7532,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.5182148914526955,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.7671,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.4716060340008417,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.7363,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.43474109735536554,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7864,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.4552286667017043,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.7119,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.4283714125903116,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.7459,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.4399614523439693,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7647,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.49806828571806006,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.8131,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.47784531729781604,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.7075,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.43140573525010656,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7423,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.45867146939001696,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.7795,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.47173040602918015,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.7149,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4463124416158129,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.6789,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.43562608521827223,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.7121,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.510107104315274,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.77,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4286178998079608,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.6464,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.5588290670377358,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.8497,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.4126039314109562,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.698,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.4438598521550294,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8173,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.49310174824619524,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.7645,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.4827642620974618,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.6694,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.62650152900039,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7612,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.4136073348459315,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.6796,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.4156469290680636,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.7477,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.46252910109112666,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7276,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.4838458879606349,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.7985,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.41936094452170086,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.746,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4246631931992224,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7663,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.5307689308824421,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.8089,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.4176028930825871,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.7023,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4050874027460776,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7081,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.5552860031694926,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.8476,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.5059177529622828,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.7602,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.45647957438327774,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7639,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.45155865053962924,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.7365,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.522916264139643,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.74,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.5960253694475376,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8734,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.5578338552813317,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.7991,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.5563161861649676,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.8162,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.40767014271829316,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7877,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.6405265025741261,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.8762,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.446836140499563,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.7816,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.40862925375593145,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7069,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.47345786974218745,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.7699,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.3964458638261727,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.7024,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4782487646804292,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7582,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.4781890518885793,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.7751,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.4814442042715367,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.7019,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.5199140974588942,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8447,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.5313990613206334,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.7816,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.4799725877181169,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.7832,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5266435584773811,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.6904,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.5046991392874137,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.7408,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.45424349902581057,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.698,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.4793111057554418,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7567,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.43227414761525423,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.7375,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.5643311648896452,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.858,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4412915834499803,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.69,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.4531276391170449,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.6854,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.4905267317151564,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.7467,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4911187969046422,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7275,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.5013240404148479,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7718,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.46830579668224936,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.7554,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.47951870119945444,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7971,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.43737643759649525,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.7431,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.4765479525151643,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.7483,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.4851346776119451,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7464,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.45666414540496963,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.7142,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.47124937687476487,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.8063,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4512181791665397,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.778,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.43799214729588204,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.715,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.49942633605142783,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.7268,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.41878555210391544,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.6978,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.4617890713138586,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.8091,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.5086351450883347,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.7992,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3982048330141303,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7772,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.5045069965717076,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.8574,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.4300046902832388,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.6929,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3854888620823196,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.6634,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.44413915656516145,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.7947,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.5148269240369787,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.7268,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.42614724746829896,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7286,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.5188637044930834,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.7432,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.4437765589378293,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.7139,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.4782643512158861,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7499,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.49675578497530054,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.7929,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.522848980382239,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.8067,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5079536115851717,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7452,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.4137829685964748,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.7043,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.453605358620488,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.7271,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.4141635974805384,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.6285,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.5076033525490622,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.81,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.40817821955356365,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.6805,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4983328302722857,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.8039,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.4383914124946045,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.793,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.6154682202613985,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.8009,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4996894135065563,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.6986,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.5342737258228484,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.7628,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.4368134499640202,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.7567,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4420806653535677,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.71,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.4452477626339898,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.7052,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.4563064594584842,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.7361,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.45460213987624065,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7554,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.42295115123363,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.6675,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.5267021561008577,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.7679,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.6540071760469934,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8059,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.5213193527286809,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.7712,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.43139377295203163,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.7263,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4316050447440885,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7478,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.42914517990572965,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.7407,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.481153045607107,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.7459,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.46811913912239944,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7748,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.4342823363947245,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.7907,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.3905026412716211,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.7316,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.42610498488986187,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7399,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.5034508370115129,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.8344,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.5586391402759366,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.74,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.37125900653608684,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.6617,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.485583458353565,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.7774,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.4954571050155668,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.7906,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.505070845435494,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6987,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.4134534293232696,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.7442,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.5263929821549653,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.7707,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.44457344668479776,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7349,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.5817464789803348,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.7679,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.5217524197580704,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.7198,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.46930386991088263,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7354,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.5051126447435591,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.6634,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.49067362493495315,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.7956,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5424462375306275,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.8295,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.47017969766672846,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.7638,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.4540154713824308,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.7648,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4837506777958714,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.6739,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.3934598204197847,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.7396,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.5432294644004281,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.6925,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4789363170771198,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7313,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.5357514667571863,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.8168,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.4500945178245003,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.7634,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.43400383552147376,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7586,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.4259067740107126,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.7522,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.5105416602018407,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.7649,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.41119635062549587,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.6514,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.4588668418997286,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.7102,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.47124117773297414,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.8015,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.4658260816751191,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8045,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.5015444435396279,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.7678,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.44157284696271476,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.7543,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.48659294076982745,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.743,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.4265474980449237,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.7171,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.4219439573412077,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.7029,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3855916919420435,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6896,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.5427974072840934,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.7743,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.3866197168558701,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.665,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4977247982824666,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7273,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.49913093342251724,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.7092,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.39274709181532047,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.7625,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.501820916604351,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7874,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.4436715480416968,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.7617,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.4162035570116486,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.6804,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.42380178364232374,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7388,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.4468586847170325,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.7477,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.42116717568302664,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.7642,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.43594572116323654,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.753,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.41700576251172683,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.7619,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.49112500879611,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.7173,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.42664436452177007,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.6814,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.5755636763054665,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.8924,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.41226417870120463,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.7581,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.41409852034763495,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.6909,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.4824416084642549,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.7301,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.47526961038746723,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.7088,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4382117625168252,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.727,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.3949202593465074,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.6361,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.4840819307930419,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.6759,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.40343798682203114,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.665,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.39634190171344325,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.6856,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.4607000428848122,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.6786,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.49937064313118673,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7668,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.47729305823170465,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.7371,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.5027456902687405,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.7119,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.43275518606600494,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7195,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.44641691338004713,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.7837,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.5351564724110155,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.7391,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4133518842584368,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7433,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.4087438003882989,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.6868,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.47086739260029037,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.6866,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.45708602893778905,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7001,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.4152859381495282,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.7203,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.5280364808819354,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.7786,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4212996214832479,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7541,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.4470649164782845,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.7063,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.4315979565521048,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.6626,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.42566473320011,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.738,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.40467441171444274,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.6889,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.37594266801130694,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.7135,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4639301597199985,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7565,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.48709909205666824,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.7843,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.4916092905206473,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.8131,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.49642888986543793,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7182,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.45779062911232865,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.7063,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.41682184977545866,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7044,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.5636949224513499,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.8135,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.5214464152961691,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.7709,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.4542017923917684,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.7317,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.47749811632424743,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7397,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.3735648222986758,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.6383,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.4016201210529084,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.679,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.505896793729167,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.8141,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.5604762114299658,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.7992,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.48154668494328884,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.7482,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.3834678300713089,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6933,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.37959432979002766,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.7097,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.42885235257161053,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.7522,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4648928612908778,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7111,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.4577270811186029,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.7886,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.4146231641601951,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.6729,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4691410563463206,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7931,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.44006688074462075,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.693,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.49475514221150574,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.672,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3784851119779148,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.6786,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.4031492933596611,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.7517,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.48680566995254776,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.7367,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5113939087123414,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7802,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.4102874605034871,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.7369,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.4171096051214564,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.7368,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4524192577700863,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7456,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.3666143699110309,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.7239,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.5440299357704128,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.8337,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.46149288718804504,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7527,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.3801147835957714,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.6629,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.4221683917992794,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.6924,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4774992398891083,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7155,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.4708910106683695,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.7459,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.5191159285599448,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.7612,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.5036982816746822,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7711,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.41557325237011916,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.6962,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.49612511563399675,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.7896,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4232564445836102,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.6845,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.4757437427343549,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.7722,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.45951650279539347,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.7107,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4930026379152187,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.8066,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.5154519145490679,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.7432,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.43147853903317523,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.7335,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.5245300853486963,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7684,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.48268223350859163,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.7963,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.45626224106345264,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.7106,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4816615903566304,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7542,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.44273772291567315,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.743,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.4627967288211668,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.7363,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4921961571550859,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.792,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.4950946721863211,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.7982,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.5001968723810016,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.7497,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4987396790002395,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7409,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.5033106613112274,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.7685,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.35568005161962374,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.6351,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4484372275365866,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7417,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.5534419189105819,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.8277,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.48027637709844756,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.7758,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.39266254437436654,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6756,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.4484161848088681,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.7329,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.40397101383857004,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.7604,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4642499099886137,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7229,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.36456119311729024,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.6818,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.4460536435572603,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.7805,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.46166211598194307,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7206,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.4733167613493444,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.8046,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.4375935945924862,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.7366,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4940390389594289,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7666,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.4364323796942158,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.7708,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.4096999973044443,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.7294,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4361399645771164,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6885,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.44174351872813766,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.7446,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.427523329003452,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.7147,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.5778421408101975,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.862,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.44459566364555486,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.7121,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.4706629147593683,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.6938,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.49030812004620705,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7943,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.4994646143202072,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.7081,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.39750133719675035,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.6595,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.45194558573661464,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7234,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.46899376059178066,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.7355,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.48068632630673874,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.7246,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.612549402178685,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.8644,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.48657475109795506,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.768,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.5062870143187943,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.8549,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.39742262719452504,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6501,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.3975895593026467,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.6275,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.4420456644998677,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.7184,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.6000175887702327,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.797,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.4845212329336108,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.7368,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.4128619727170774,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.7111,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.40976818344769816,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.6721,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.5002400319715944,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.7052,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.42928149943260674,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.7343,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.46100917796831087,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7286,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.5142961676832801,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.6487,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.5098026662721512,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.7197,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4460468154341747,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.747,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.4775939193735352,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.6684,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.4661822041472872,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.7398,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3917799650828493,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6741,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.4467040975511843,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.6942,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.5170019902686592,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.7176,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3994721099123077,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.719,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.5164083639501578,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.7935,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.41581390005486735,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7246,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.47301085242615,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7147,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.38789396006274784,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.7101,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.5357165894970772,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.7068,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.42820463843571843,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7453,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.37348328995932895,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.6773,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.4568102293336539,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.6935,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.465763855201166,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6747,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.44343252011603657,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.7298,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.41756697563581,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.7099,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3543448389846333,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7016,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.4749102980088939,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.7041,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.4843989649822722,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.7386,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.43647707931454,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7496,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.4488231733014884,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.7443,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.4260702659667508,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.7232,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3986432094275747,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6624,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.4592828621112967,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.74,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.4408939554219865,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.68,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.5086358279517801,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6979,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.39525537490305795,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.6846,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.4885877975258833,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.6606,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.39425244906586043,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7186,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.5567465211281566,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.7419,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.48497221446308514,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.7072,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.452834992623621,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7357,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.40701039380651016,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.7171,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.4105060383138123,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.6545,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.43566126345189354,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7283,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.47208521131207976,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.7091,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.5688134151204622,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.7425,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.5443143232913138,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7445,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.48085819843103145,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.6705,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.4386271549602791,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.7251,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4096971176974968,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.648,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.4993489649886308,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.7352,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.5522085818405623,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.7709,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4400400359043301,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7175,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.40620390789782984,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.7021,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.5572622124721404,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.6601,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4323977666292517,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.6909,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.476603405798883,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.7392,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.49721643178524155,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.8066,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.4727030971806207,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7741,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.43043729497363453,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.734,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.4891809662815345,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.6993,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.5432057177158748,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7441,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.39716243401806606,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.7452,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.5257584489427201,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.6369,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.40698333482853216,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6332,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.4369951475725393,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.7275,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.43995365876772313,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.7402,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.40987317542711543,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7448,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.4932690579196363,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.7736,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.5829479809151474,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.7785,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.5123604082665411,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7132,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.47175680926389957,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.7543,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.463546791041696,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.6827,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4736667915714236,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7586,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.5095245626981498,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.7394,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.4079615704673969,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.7186,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.39594416147779593,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7293,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.49022952350120763,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.8388,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.5171604362910454,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.7446,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.39036446499127037,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7097,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.45214678839921707,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.6657,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.44036420103706336,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.6973,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.39197642684368633,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7057,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.3988590838084675,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.6701,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.4958658645415372,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.7006,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4456206698071224,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.8273,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.39693850277163817,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.6024,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.428211053877834,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.6739,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4791909619530287,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7601,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.4750482800486702,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.7445,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.41290528170872354,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.652,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.42791752193612376,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6863,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.40641415334107067,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.682,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.3878362011438658,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.6437,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5024343126636618,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7408,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.4962270164379621,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.7553,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.38549643158558333,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.6732,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.43394431289703034,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6688,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.4294888638549087,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.7451,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.513370767246734,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.7381,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.38562627685147055,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6666,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.41283769392651115,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.6441,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.4619707404051264,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.7493,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4287926829979124,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6777,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.4160457016809911,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.6726,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.5554187905057468,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.7604,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.5159855991913099,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7763,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.41714889656788784,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.7219,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.49842980465802544,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.717,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5844190017128177,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.8275,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.4595790009260156,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.7924,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.41040216394669327,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.717,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.44866665228577424,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6627,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.35373256778456563,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.6602,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.4355239438501313,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.7167,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.45168989893797795,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7315,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.433058877472146,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.7167,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.5147396969205077,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.7996,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.39820945052815354,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6183,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.4494938100536776,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.7202,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.44647286417911763,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.6793,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.45288656938564176,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6569,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.36496777988182755,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.6806,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.38203936765064545,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.6965,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.4709068591866945,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.746,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.4219177375044993,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.6893,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.5870826706336743,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.7592,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5327220178331229,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7329,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.41305113452194786,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.658,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.46250787679599625,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.6998,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4213464021773313,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6636,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.41013232445121084,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.6652,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.44713210682610505,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.694,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.43464675714158246,
+      "learning_rate": 0.0001,
+      "loss": 0.7232,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.4249593090191897,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.6795,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.38694978508371686,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.6764,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.5271099145074317,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7437,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.46321151785706344,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.7345,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.45221869725403974,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.7907,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.42539803281429966,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6517,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.36006699989058355,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.6864,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.36030444172155374,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.6732,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6375235575589436,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7887,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.4803671988291979,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.7565,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.40744927417844706,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.7236,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6152319463546623,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.7776,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.455641754941924,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.7718,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.4261067033513993,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.6904,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.45068920636638804,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6989,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.40785941519734986,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.6472,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.45278998387070374,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.7171,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4702138361071314,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7618,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.42087003933684464,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.6856,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.4529602989870126,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.6972,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.41772132583315275,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7117,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.46866569234472266,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.7737,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.41867640636136205,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.7189,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.44186665568231653,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7619,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.385017555406739,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.6549,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.43614042454037605,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.7709,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3762296247678466,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6497,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.47830120516263364,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.7929,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.5227073834645749,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.734,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.44905653509857996,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7087,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.42883202891996847,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.6814,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.40213194026312754,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.6643,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.4333556142908201,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.687,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.41318618429500226,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.7211,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.39668297734504787,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.6753,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.39651168833521083,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6816,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.5556691657425049,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.7181,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.5302412334783085,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.7832,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5133971152143451,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7923,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.478268161561676,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.671,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.42157207098216287,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.7259,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.46073184075292656,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7224,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.4442118173092234,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.6875,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.3731653885908938,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.7004,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.3970596950162719,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7128,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.5842297674203025,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.6944,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.48127990585054503,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.753,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4600128750907082,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7023,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.4136237359433627,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.6706,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.5587048888071383,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.757,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.3967339415854174,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6573,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.42270362295273634,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.681,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.4363570791175844,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.6836,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4807524333102998,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6569,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.4505781908324612,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.7345,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.41450043002651243,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.708,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.402226024781564,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.714,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.4657621112858707,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.6965,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.44556512646731455,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.7212,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.47197005330850506,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7444,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.476622780345975,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.7529,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.48936308862553063,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.8097,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.5634106240839507,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.778,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.4122836612929818,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.6886,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.4245650764659275,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.679,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.40997646926570785,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7081,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.5391734428164183,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.7203,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.37763196105289126,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.7109,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.6457489522459617,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7399,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.5309623016242518,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.7563,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.4714886530100088,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.6578,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.42346170210757944,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7486,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.37881968964596124,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.6547,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.47059974199545634,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.6519,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.41995862073775514,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.707,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.4870154589149614,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.6871,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.5446337912386945,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.7219,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4600082573187779,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6752,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.602259027120918,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.7799,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.4195158570317328,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.7083,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.45297381521337465,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.76,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.44714546612521117,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.7611,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.5761347301832973,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.735,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4066225432883208,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6534,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.42139487861342373,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.7228,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.4606520290345603,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.7882,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3970252044933702,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6499,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.4261427255932377,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.695,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.5251732898816305,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.714,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.41141372444674557,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7439,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.41889632476168526,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.6985,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.38595234209643636,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.6944,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.4351385284929873,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6671,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.46213495852699893,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.7162,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.38040261025201877,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.6336,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4778906493943159,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6506,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.5604201857765944,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.7572,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.37664708048191525,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.6771,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.47181259684094984,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6562,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.41530801771730214,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.6284,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.5174511494038859,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.6459,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.6389466638942793,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7926,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.4948881751553678,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.7238,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.3991432212775443,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.6606,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.46658729965932916,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7219,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.4413099807534331,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.7006,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.4958849327877985,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.6708,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.45632539986067144,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6722,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.5116081581399695,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.69,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.4883601563346209,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.7335,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.5028990558941931,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6732,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.4719857979519838,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.701,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.46411915311096336,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.7122,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3934084226407831,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6416,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.4648759855900219,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.7334,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.47281681448887125,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6864,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.4394932571747774,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.65,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.4790191382059904,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.7583,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.5237037737491155,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.8054,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.5098117771489851,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6473,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.4682109938448841,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.7574,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.35640481541049723,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.7113,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.38739878010398104,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6585,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.3983792120697055,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.7034,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.4113268423149,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.626,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.4088959160760906,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6491,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.4274621841367322,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.6996,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.40221440742043035,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.6947,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.5283388538749225,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.7336,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.40324576874310253,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.6637,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.42919137885726905,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.6753,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.440153196601291,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7209,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.42001973533574605,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.6795,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.5868176666072477,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.6963,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.45549033769661046,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6951,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.44250320987177044,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.6385,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.39035373524284306,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.6368,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.464696040293888,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6875,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.4586068794220114,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.6837,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.41551188145094703,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.6586,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.403407353589169,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7105,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.4209550504605238,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.627,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.4025562030361292,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.6829,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4115361755718086,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6782,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.46293428949464527,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.711,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.4525500352174917,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.7214,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.39438228243314793,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6468,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.49263504033528177,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.6148,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.39727946685820814,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.655,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4596553941643732,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7272,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.4282382813081575,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.6842,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.37547710323647004,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.6973,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.4966657023913872,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.654,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.44782442402270567,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.6642,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.4381324912137808,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.7392,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4254768671365201,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6449,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.4424722561990435,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.6821,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.41725615187438503,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.7084,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.48119746299765054,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.742,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.5834048082380597,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.739,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.45966539280766744,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.7358,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5246437083991626,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6915,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.4555261618321849,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.6493,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.4235653668589521,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.7031,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4474416245759516,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6918,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.48694069827586794,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.7333,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.4376511170189795,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.7553,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.43980114940399667,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.733,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.4540225086137344,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.6927,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.4028754666797807,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.6917,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.5450526119568553,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7245,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.40295745572301156,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.6052,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.4187400391892376,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.6958,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.407667069869259,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6963,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.4178318695815154,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.6729,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.4162537281857959,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.7507,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.3926901260513355,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6546,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.48529116831628166,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.6648,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.45305284197698503,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.6826,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4561702027539845,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6977,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.43187343070490947,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7343,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.5301500087261996,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.6863,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4747256643705912,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6237,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.4709266319943518,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.63,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.3974957399383896,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.6546,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.503966249187983,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7036,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.41288684339233356,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.6381,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.46913523549298286,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.6982,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4752171520813708,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6727,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.4943113707571559,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.7352,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.4264027372481861,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.7045,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.7235056153582596,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7683,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.563107580693706,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.7123,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.40194345871372006,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.6651,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.4202305543838757,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6785,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.46577617049452436,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.7267,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.4723083390327057,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.6962,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.39044608177447104,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6706,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.3892753596936745,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.6718,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.4701937200996348,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.6883,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.35726664334890657,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6548,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.4090789779534833,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.6547,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.3990999408536816,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.6635,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4363843309324732,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7254,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.5106673101572715,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.7341,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.392666217890261,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.6688,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.48809746540697774,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7258,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.41397786052470315,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.6374,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.37907725401780545,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.6633,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3774594167209582,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6541,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.4890042336911856,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.6494,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.39851877805950653,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.7063,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.329191610211986,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6097,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.4903775189635333,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.7756,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.4962745895251398,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.697,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4215012367902803,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6696,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.5602802699887822,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.7679,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.42503625863823996,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.6987,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4557339264919862,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7613,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.3894184456838005,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.6568,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.43330359333084534,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.7066,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.44576363394684887,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7179,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.4145433388543684,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.7144,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.45566017580548274,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.7061,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3949098601987805,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6876,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.6521678312516719,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.6718,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.4331369024736756,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.6468,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.47666236297639186,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.746,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.44631760948927673,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.7215,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.40879940519529123,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.622,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.46886407286861076,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7576,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.3948554102536063,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6663,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.4259830535046582,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.7286,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.44144107124314336,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7311,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.543239191674462,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.7578,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.4872870664724931,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.6416,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.4214827835586575,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6882,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.7102502410108671,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.7106,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.42612395704378414,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.6712,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4112579344710362,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6264,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.3992990179883584,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.6441,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.49175577071302734,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.7145,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.38660563173633494,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6236,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.4825247592155358,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.6602,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.49627838405883506,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.7141,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.38542797334088236,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6469,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.4036454395837023,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.6091,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.3999331565255694,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.6556,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4047271951245858,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6822,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.4638043930440745,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.7976,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.4140751557173669,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.6654,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.40175278138010456,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6837,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.448361264712076,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.7054,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.4903001942235706,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.7065,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.49209525819268396,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6848,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.40232475033847775,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.6638,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.42607086576536785,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.7159,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.487693420824358,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7019,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.4213757880177138,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.6985,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.42890405014839217,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.6592,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.4759282538023443,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6273,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.45053499198596025,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.6976,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.43882677659957103,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.6876,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.7611731610544257,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7676,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.41985260340167807,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.6873,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.642664100308076,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.7394,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4490334732239554,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.736,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.6671440485736441,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.7755,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.4648759376811008,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.6835,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.39704081725262047,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6712,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.4341131618860968,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.637,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.5132419370155383,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.6802,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.3970131149411168,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6439,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.5563796139787245,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.7427,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.4727356660983191,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.7454,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4678442552896839,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.5738,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.49860848199843866,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.7387,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.45505539284147367,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.6501,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.40299107203734746,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6747,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.3893315999426221,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.6468,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.4146645361836117,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.6628,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.39969819955401586,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6847,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.5499794887435745,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.698,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.48647645191739924,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.6309,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.4285453792077148,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6534,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.4324294114731592,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.6817,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.46608044171446206,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.694,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4073665107443134,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6852,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.45869252338084976,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.6808,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.5828362701029322,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.7766,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4269296331948093,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7202,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.4519904640620554,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.7003,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.6444979308466544,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.6902,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4699420609867648,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.7015,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.5022268277610995,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.7158,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.46082265208514983,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.6412,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.5103161933623592,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7312,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.5147347915351437,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.7794,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.5421872091812179,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.668,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.46411215498995456,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7255,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.3887518797785626,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.6083,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.45110254673438077,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.7377,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.3958967453111445,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6958,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.423884885430956,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.7114,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.4146595568379947,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.7451,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4220457869659306,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6996,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.3810707534390687,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.6529,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.3810992708104994,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.632,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4637435578141067,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.7968,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.4832133631853069,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.7038,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.36830718999500317,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.679,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.41448228827687683,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6287,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.4261682036368644,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.7054,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.39173878941998846,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.6892,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.4612650363372707,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7067,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.5415273429390662,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.7567,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.42985550421095275,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.6789,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4514876328128349,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7334,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.48622030317888465,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.6965,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.48515398417323474,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.649,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.42337213509437976,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.5853,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.46739335531326504,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.6734,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.43779643398988793,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.6782,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4600479763327274,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6969,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.39366032769643583,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.653,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.37558272896056105,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.6196,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.37272200377118736,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6228,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.42945206411674275,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.6268,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.6166029413629494,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.7235,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.42629619211839953,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7182,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.3671463855501407,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.5879,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.4094433707586426,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.6218,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.4157481878534906,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.618,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.4274197173120849,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.669,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.3970755103030679,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.6384,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.45084601233135135,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7519,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.40924399535224726,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.6525,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.5987694684687338,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.6564,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.41156457909001926,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6745,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.5675373666675694,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.7554,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.4530565326014445,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.7656,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.43455739336857785,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7639,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.4625805066144766,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.7301,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.4687573326755966,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.6784,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.5400521150678994,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6784,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.37891632845292245,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.6627,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.4485527940764225,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.6813,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4540503467371925,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6693,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.4865435283342518,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.715,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.43575127743652914,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.6665,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.43019359102950366,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6586,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.5113079554729141,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.748,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.4821203181807719,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.7528,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.42134366694499037,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6756,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.4504160027209184,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.6741,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.4042945996342561,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.666,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.5034168527915088,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7473,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.4730515114462499,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.7597,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.3375261764602711,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.5817,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.42148433248647954,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6713,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.3536704135783801,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.6422,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.4732608797163496,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.7479,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.5396160418355413,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6324,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.4842880189383717,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.6485,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.5563890271607748,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.6521,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4425083354838779,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7191,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.5980880189944785,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.644,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.48233570275749316,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.7742,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4012527813911815,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6813,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.3679257098356443,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.605,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.40018105835742707,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.6156,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.4308179433569843,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7381,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.5272604799209487,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.65,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.4421385760524196,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.7449,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.44588841990396755,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6753,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.40033860076889666,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.6779,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.4538456241393483,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.6184,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3666606205717183,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6501,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.4934538346550755,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.5735,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.6109550738227458,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.7158,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.4751922921016551,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6365,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.4442858112293756,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.6313,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.4502163638711434,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.719,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3850076588185577,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6237,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.5012850995227692,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.6857,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.4261235683833505,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.6391,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3706138666410532,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.5812,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.4297917510015146,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.7014,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.5049950909930079,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.6924,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.410115601309095,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6814,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.435910679791374,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.6852,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.3304471307752587,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.5666,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.5115953250373663,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7767,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.45214157826179824,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.6494,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.39820922009077997,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.6352,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4789431281088089,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6866,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.40275237465807845,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.6591,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.3713650013349836,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.6425,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.41188885924377205,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6242,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.34893506665356233,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.5932,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 1.2163344446598188,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.6846,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4067397288108614,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6401,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.42477780694654677,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.6874,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.44182676800461945,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.6943,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.41404454441862243,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7061,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.3942082685818331,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.6874,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.4279982410771876,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.6441,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.41782546710583973,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6537,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.41077064496653026,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.7045,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.3880835455808636,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6458,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4377648053400171,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6573,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.465498029493161,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.665,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.518912253454699,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.7123,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4592564284357289,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.692,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.421272376232594,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.6788,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.4190421951105113,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.6089,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.5296063611058046,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7328,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.4119536279993497,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.6775,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.4372127695497076,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.7381,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4623282475892554,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6621,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.41289449863476224,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.6196,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.480771875212797,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.7198,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.39296287900786697,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6006,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.41689104434774116,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.6478,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.43257497028506464,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.6128,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.4681766355573039,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.703,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.5935400521657089,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.7587,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.5077586112824047,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.6681,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4606788504253986,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7005,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.4409361567025991,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.72,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.390447354877361,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.6881,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.46312060958483303,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.5681,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.4367757132898544,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.6619,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.4553356814837973,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.6796,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.45225260368863773,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6866,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.4536254760360822,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.7353,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.45167273611910475,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.6506,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.4653418492705205,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6487,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.43782464350968103,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.6619,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.4204359056290449,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.6473,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.4300521099061381,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6369,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.44165546763416386,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.6794,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.43339208508125643,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.6419,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.40186803579797137,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6658,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.3841899309149006,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.6107,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.4158909919301638,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6268,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.659571482120607,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7197,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.4482032932856501,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.6397,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.4761032227320628,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.7583,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.48003199160702137,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7259,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.4074685491078901,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.6619,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.7582833567287415,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.7086,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4186350368778986,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6425,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.4875366017253042,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.6936,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.47152194894082033,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.6941,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5077366430562386,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7384,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.4844444506906377,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.6448,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.567970942644562,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.7004,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.45285637611447,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6252,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.39904939346272067,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.7037,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.40870457636421614,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.6915,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.38517232522524636,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6231,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.3980631285502874,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.6637,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.38569816720663086,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.6396,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4235640896081435,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6323,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.3669973174944435,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.6365,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.502319781637613,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.729,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4443826452871617,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6004,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.4009814414355795,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.68,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.4072069258362513,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.6438,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.534313603465075,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7025,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.38522779892576314,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.6212,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.42540386104077615,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.6505,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4415591223220511,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6922,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.4974882957691649,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.6332,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.4572713358634199,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.6756,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.5632009516879064,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6886,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.5654612611201677,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.7607,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.46848708751036583,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.6297,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4019437430654483,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6642,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.5391388624920869,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.7346,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.39762033129290314,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.6499,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.6089772015665591,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.7228,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.4091684206732233,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6212,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.3875516194768679,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.6148,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.5087414360417498,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.669,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.42052223640750874,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.6819,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.5571920739198382,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.7379,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5589725779775973,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6986,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.4077774865425764,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.5673,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.3966051236938129,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.6,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4274422908439089,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6335,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.33771537843772004,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.5881,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.4093214392340917,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.6448,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.44006827822558175,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6848,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.3957670014836868,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.6361,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.42111179500586743,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.7322,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.447595877658861,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6413,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.5004926499095018,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.6786,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.465628029025143,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.6439,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4516378355070736,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6734,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.4193879525975684,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.6887,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.37092129339802005,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.6312,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4105063644947912,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.713,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.36374646860495985,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.6263,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.5074406776993624,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.6642,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.4128235360455528,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6121,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.36038965284139735,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6256,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.39572687509499344,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.6351,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.423484534839522,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.645,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.4386416765973164,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.6079,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.39285308813316205,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.6764,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.4459364112421892,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6397,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.39168469410823503,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.6507,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.3722640125985169,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.6252,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4355454966323861,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7141,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.3430399297126906,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6126,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.4583537527387117,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.6296,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4279278274481135,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.645,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.5038785683153161,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.6757,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.48090150321401093,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.638,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.423908701621885,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6314,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.4564407646195038,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.6831,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.5008191526678891,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.6883,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.45733730495896463,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6117,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.42993085879339066,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.699,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.476292986154224,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.7199,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4102089662875839,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6555,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.4180688510909825,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.6948,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.38743362692091166,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.643,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.4149369355782099,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6803,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.37950622060268036,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.655,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.4943825903704621,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.7194,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.400978347396589,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6154,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.4775873407139853,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.6247,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.4553479375148142,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.7256,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4244814429336164,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.681,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.4508251541923756,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.6884,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.6243044339114079,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.6957,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.44969579306877716,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6317,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.43554840209066353,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.5915,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.5896809712135934,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.7204,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.46883068997796684,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.686,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.4670614214856696,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.6685,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.46998743057063663,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.7182,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4386788250764763,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6414,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.4136227444175789,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.6453,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.5414548857949473,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.6031,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3719962187265138,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6053,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.4177683383290942,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.6506,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.4729371368750245,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.6814,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3262855975367515,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.589,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.5480520090986583,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.773,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.44517043921596944,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.6706,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.577481543003918,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.7275,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.4397967027612655,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.6359,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.3918583356457884,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.6502,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4776416558413356,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.5677,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.4420162143207249,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.6636,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.4443006247321744,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.6446,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.4503667815472766,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6492,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.4612675657557288,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.6477,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.43209222342026876,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.6564,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5477723334156583,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7517,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.40481915910603655,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.6442,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.4947924933794212,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.7493,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.5931479437790562,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7587,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.40706977530746524,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6408,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.45563104066901616,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.6549,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.4315762607401044,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.714,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.4711245706366215,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.6258,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.46143225200024446,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.5804,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.44584803737341006,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7378,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.4803316915992794,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.6421,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.3871107029547407,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.6333,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.45914687917486513,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6542,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.4508811891950307,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.6193,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.5939130372404645,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.6265,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.46173118075953334,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.7207,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.4440593688395496,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.6581,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.4251307059431981,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.659,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.46460031624996845,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7175,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.5848587823882815,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.7212,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.590168152666092,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.6946,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.4675547950081361,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6913,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.4467296778332705,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.6609,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.4102157735989331,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.6468,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4413627034705757,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6379,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.44763494794833336,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.6661,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.40904324118452856,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.6629,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.4387769979377223,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7335,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.5082678181995608,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.6619,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.49378095505218844,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.6885,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3781146245279851,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6381,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.4403455658984126,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.6521,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.449520980445605,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.6525,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.5040550123853111,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6642,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.5171102560168016,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.7313,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.4243066672704072,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.6828,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5606977986193548,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7393,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.356765312259291,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.5923,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.38578952542997724,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.6018,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4055517387530648,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6542,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.387200711065784,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.69,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.46122566793370784,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.6456,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3779164218156875,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.631,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.41905632593435094,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.6634,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.3492251627583365,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.5941,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5269141861545891,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6662,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.4041237366118928,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.6543,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.3417551454751479,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.5916,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.367928203255809,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6329,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.44284394360614165,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.6217,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.5289192468404308,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.6665,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.41253131494010853,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6808,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.39958183092768723,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.6281,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.40604548549275743,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.6353,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4127316174238306,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6235,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.39994693947875093,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.6515,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.46198926568901394,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.7261,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.36620587997299,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.4028536167555855,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.6736,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.45078438510895197,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.6627,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4273797825239026,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6443,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.45175325555552603,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.6592,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.5273387684892102,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.7525,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.3901100788076458,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6339,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.45213561226244414,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.7457,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.5300440695732356,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.7088,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.49408808988181024,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.728,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.40975088663541037,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.6124,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.4303423444928659,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.6476,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.382474083230923,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.62,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.47548658272504035,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.6212,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.49857153711888064,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.6032,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.478559876485903,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6869,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.43451585111407687,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.659,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.4067218835749194,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.6459,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4488765971746705,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.7156,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.5039706276132117,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.6485,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.4660787888411466,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.6908,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5086288860613459,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7498,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.42840707314228765,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.618,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.4930273945764172,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.706,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.40006850983935494,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6097,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.4335664772092174,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.6549,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.38758194133564866,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.6469,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4352181098736593,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6362,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.43614995094953685,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.6483,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.3734845483574157,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.6248,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4673738756550724,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6286,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.46931770163566056,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.6718,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.5054076199605804,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.6601,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4156887330306433,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6871,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.4848129151230194,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.6318,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.4090143670275399,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.7051,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.47023117574919554,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6403,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.4855511342637403,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.7308,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.42127353506362863,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.6605,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.399395936860529,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6525,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.3648858569255673,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6127,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.4425814065357602,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.6974,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.479016021723167,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6837,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.44310138485519696,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.6285,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.4340924327388896,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.6982,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.43148264237357026,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6611,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.48868884574412247,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.7426,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.4120744248577442,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.6261,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.6190391045957169,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7688,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.4049601799461113,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.6023,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.39554572306685953,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.686,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4286225445682662,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6289,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.4845861275686903,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.626,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.3963837981727752,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.5965,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.4134911418769395,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6541,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.42104755345183614,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.6809,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.4259978224833793,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.7122,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4380553719130741,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6813,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.4273175887634649,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.643,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.6612426177197686,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.6687,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.33655424416475366,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6255,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.4771674213689379,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.7404,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.3952446378554752,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.6337,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.41448139569062153,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6876,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.4288409230911495,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.7102,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.3783481327021028,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.6534,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.4542569647870413,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6881,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.4006263879464807,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.694,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.453401714144608,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.5955,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.44396075332125057,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6449,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.464087674884783,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.6529,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.46665628649754415,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.6134,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4267206777431721,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6107,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.45369038854807936,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.6156,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.49824396400889714,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.7031,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.41115424024889685,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.5949,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.43144386097553383,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.5647,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.42295463970623487,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.6755,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.41488207349200906,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6194,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.47559587547989834,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.6773,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.42148859792374616,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.6323,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.5122183779481083,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6266,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.44424699537254064,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.63,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.442110611521818,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.6242,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.4545369208026764,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.727,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.5105146615317737,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.5459,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.49004569063844167,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.6407,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4359967625922056,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.652,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.4677743393043125,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.7219,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.42052686908942477,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.6367,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5224132859527966,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6171,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.4200233190413443,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.6195,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.525695660376052,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.7904,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.40558947882316576,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6588,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.5103020883576876,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.657,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.3873266453233226,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.5752,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4285241457568737,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6976,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.40857593789774727,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.6062,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.5410195800482295,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.7412,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.40797808336174124,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6788,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.560901318464611,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.7575,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.5212007528527816,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.6335,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.5186025566667148,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6791,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.362825359016172,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.6582,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.4175997612171381,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.7087,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4361299595187199,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6422,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.40379275500431583,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.6474,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.5432519210652561,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.7521,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.49489904176517713,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.726,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.4778114674274999,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.6466,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.4933705071723949,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.665,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.5589503002222523,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.7616,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.47525552375930347,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.6679,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.4113964591102893,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.6147,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3689891073538882,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6669,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.4628204941072679,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.6799,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.4468459877387718,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.6032,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3947347800824203,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6312,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.5701409670608103,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.7194,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.4151374112707517,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.6649,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4793828838725344,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.718,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.7068717534009299,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.6686,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.47401068139669833,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.6603,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.44061731871766496,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.5891,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.4207110761408695,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6806,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.3874035896421297,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.59,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.40253686610510786,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.556,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.5415411228332792,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.6956,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.3945963732370113,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.5793,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4366079996575693,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6568,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.44634155267347175,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.6857,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.4115469207308905,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.609,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.5419266391289995,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.7071,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.47637699928223176,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.698,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.5723400801568562,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.7531,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4009591922291512,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6807,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.4331252958668281,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6509,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.399769059999911,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.6546,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.45085868281053915,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7071,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.5514151785507176,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.752,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.3959419936368739,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.6068,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4483500145944997,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6213,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.36596013283213197,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.6135,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.3603482051100899,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.6312,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.3998640794375178,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6364,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.428649379549905,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.6464,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.390593369924257,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.6983,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.6016004599654001,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7652,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.5882288235315833,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.6941,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.4613460981333321,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.6764,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3898156569663358,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6392,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.4382461918130465,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.6724,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.5305696960018302,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.687,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4187652465149059,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6663,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.5111558907436493,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.6815,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.34906468871197094,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.5988,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.42478348658580695,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6394,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.33392343235361127,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.5464,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.42316395196008005,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.639,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.43602319797551964,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6127,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.34533336200222786,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.5881,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.3875861538559915,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.6516,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.493267360371915,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6516,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.41237596426549245,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.6859,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.4083831627195735,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.6716,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4115879104570026,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6518,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.47996363631082506,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.5994,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.41235060776600496,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.6065,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.39209566990715516,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6153,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.39731442729571154,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.6739,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.5135213180159308,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.7119,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.42965918561188526,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6858,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.40389515210214166,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.5784,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.5590010970100625,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.649,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.41491347188958494,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6251,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.45496219950380323,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.5803,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.44954579665324645,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.6048,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4134655703681969,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6379,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.46066883694485544,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.6663,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.3984938279054454,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.6545,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.39823108645855065,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6879,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.7422035044982245,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.6672,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.517550103030721,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.7166,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4406193421538898,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6439,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.44058810187732483,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.6405,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.3964524908097487,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.6255,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.43469904654814184,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6235,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.5106227882923048,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.6526,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.70962295043018,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.7534,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4423402123776244,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6327,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.45116531658470177,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6964,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.5864062161055378,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.6849,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.4509488119478894,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6471,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.4381582202926343,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.6643,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.4237960794460431,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.6879,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.4374546398625835,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6416,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.5221175916812671,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.639,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.356264189927237,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.5445,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.38973756787229913,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.5965,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.543973752427358,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.805,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.42979347962101977,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6575,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.5042407052213125,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6405,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.5241413589166609,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.674,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.408613955885076,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.7184,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4379860229388707,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6262,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.4323480463923861,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6519,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.38506660557803357,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.6361,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.37880504506736284,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6247,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.39685044634962247,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.5881,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.4194357098536144,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.6035,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.43625634338608676,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6912,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.4027428662526194,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.6952,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.42632449428971303,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.6388,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5337207026513404,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7587,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.3697182493057979,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.5743,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.4531441112225663,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.681,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3584300034986864,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.618,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.4226329000909182,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.7039,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.42418772454024434,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.6533,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.405422813829527,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6175,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.45982871735958847,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.6622,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.4975151975727332,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.6809,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.5625783691772048,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6318,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.4858590432799323,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.751,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.4433252691438286,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.6705,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.5128465720864208,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6579,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.4935130039822559,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.6647,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.3941506153553523,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.6243,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.4136221075040379,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6288,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.43436322990507925,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.6246,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.45907323619964835,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.5889,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.41713400266272166,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6305,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.3953181357796512,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.6505,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.3943289612083844,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.6478,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.47474097580682073,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6493,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.5115671907494317,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.701,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.41664514407348774,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.581,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.4034158594351879,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.7405,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.40086384740895586,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.6511,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.5085314721751837,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.6701,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.3627036576986965,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.662,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.3969669843973643,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.6636,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.4279935043259893,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.6628,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.46402161198748143,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6347,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.4316028438126866,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.6276,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.4873488395493903,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.6645,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3930856325704254,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.5831,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.44410962630873474,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.664,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.5129387411644766,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.6779,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.420044883229394,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.5983,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.42475271473599874,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.5825,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.41883555551198826,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.6679,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4240520730595356,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6697,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.42882446982717437,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.6238,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.45484800450889074,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.6526,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.49953186634440994,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6902,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.44691847958582187,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.6406,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.43100974826637617,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.5908,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4631337704823913,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6253,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.42084410540376455,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.6199,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.4552591722852423,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.7056,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3643185236453847,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5695,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.48570227590289416,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.6864,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.532717249543278,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.6669,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.43681308469909813,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6378,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.46099230085208504,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.6504,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.44090056851603815,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.7049,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.46180798760173736,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.596,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.4391562689172221,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.6667,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.8626806184707471,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.6449,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.43409069557301966,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6563,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.41969529930274724,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.6356,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.4222269715265879,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.6429,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4670697419201925,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6474,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.4452819177418937,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.6875,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.463490060267024,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.6633,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.3940813094632287,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.65,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.44305740563962925,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.7005,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.4651866154502036,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.6124,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4674151975111781,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.5779,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.4823441352205061,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.7893,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.5016534944918397,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.6479,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.38102575372409986,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6086,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.4143079328856298,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.6794,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.48552698446001646,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.6634,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.4271016805768734,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.5702,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.4467084259531285,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.6671,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.5044855288814932,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.61,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4426977553290052,
+      "learning_rate": 0.0,
+      "loss": 0.6366,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1615624508571648.0,
+      "train_loss": 0.728021297454834,
+      "train_runtime": 29030.5177,
+      "train_samples_per_second": 1.033,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1615624508571648.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4e4609419f1cec8e876ad4aefe228cb4adf6620a
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "q_proj",
+    "down_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ee8c74ecb6d491e525e8268856af3fb554b1f09a
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95a28f5ce059aa30ed757830ec3ba2e43f2714c43c7380841383a7717f868ee9
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..89d28814449aa3d58a811a1c46c975b24271dbc6
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b7e7d8002a045bd7b515936259cf5e3afbf0ef2e0bfa46c06962624cb61706c
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1dbe9cf566f9c639c583d2ab630f7845dffc4595
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.9187046636727805,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.3821,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.0792624480572932,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.4152,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.1509679477952406,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.5638,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 1.0153140074758051,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.4518,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.8305882584346376,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.2897,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9580646718718947,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4666,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.904298840595612,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.2781,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 1.0013840824825258,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.2608,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8790398686631322,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.2048,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.9106038042230968,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.0825,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.8364248025570733,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 1.0387,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8471561810879394,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.015,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.7953795863229357,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 0.9882,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.8785775945766162,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 1.0482,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.723417623282454,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.9608,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.7611912751822496,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 0.9426,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.7252600959832706,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 0.9846,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.74931191744177,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.0573,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.8045241840406074,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 1.0575,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.6035180579178393,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.9674,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5945409277287728,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9895,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.7344825474590768,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.9581,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.53022118934138,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.8633,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6036465457595838,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9019,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.7434093244468174,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 1.0828,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.5789996465537413,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.9046,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.6207781817193246,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9771,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.50118463017605,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.8243,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.5150599428286029,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.8724,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5974204555861369,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9403,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.46479309739544894,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.8376,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.5924711783520056,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.9561,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6199971213166166,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9147,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.5861895660841031,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 0.9514,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.5650218068326689,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.8629,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.4992585437751919,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8666,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.6587479357119546,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.8885,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.45989413876422613,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8707,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5541375304460954,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9042,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.48678040316016685,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.8844,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.5724673519486235,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.8746,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5773561902579017,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8465,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.522888769255152,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.9228,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.5010094318522428,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.8968,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.7323733472135996,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8296,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.46966043829200105,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.822,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.4784577654699424,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 0.8326,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5225519559970078,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9327,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.5305933486499982,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.866,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.48942561215519315,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.8911,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5438001724361794,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8168,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.5252206611833089,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 0.8275,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.4802938547461449,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.8618,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5118393802144957,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8513,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.4612064453172076,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.8184,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.48808752152242685,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.8178,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.5108056735883002,
+      "learning_rate": 0.0002,
+      "loss": 0.9261,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.45353549861879516,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.7657,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.502886332585044,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.7874,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5072212825451821,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8058,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.49202407084104344,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.7515,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.578940194689475,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.9381,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.6085015022875951,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9601,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.5075157023962301,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.8628,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.535946755129502,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.8856,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.42942779507639506,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7587,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.48832417407332307,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.8236,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.5739164258014042,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.8619,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.7300022332551496,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8394,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.557117083548834,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.833,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.49123527914022486,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.722,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.6932706617976879,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8676,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.5373073147194048,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.7757,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.5315954660113272,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.8667,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5507964885116554,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9107,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.5456497491184378,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.9106,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.5416706546625402,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8239,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4995789896391561,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8562,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.4799575897021053,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.8265,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.4437816317223756,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.7988,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.46714732306952683,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8644,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.5700411679789269,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.8827,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.47593653750268666,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 0.8986,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5442012033062404,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8896,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.5160866877355936,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.834,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.49848101926340194,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.7804,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5434634623520227,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9614,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.5625199742085497,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.7917,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.4231055623029346,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.75,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5293710754735215,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8658,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.5558763172233137,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.9193,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.41754803852935723,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.7898,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.6256971093758742,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8806,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.43284536757002134,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.7827,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.44742761100470746,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.7217,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4168981829618602,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7092,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.5575162923981515,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 0.8252,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.5302261971536973,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.8462,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.47874017698553667,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8181,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.5012430924834531,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.788,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.5048653719282753,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.8573,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.49047578456287433,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.7898,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.4970060165629044,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.8532,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.5401711101860304,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.7758,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.5153031149175047,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8737,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.5559208186733371,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.8369,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.4716374538788385,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.8803,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.550845985188517,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8375,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.42156146280243834,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.753,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.48509810772114836,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.8412,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.5023266195694092,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8301,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.513339172085789,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.8015,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.6203368739647647,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 0.9331,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5355397165316813,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8685,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.449465322541369,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.7457,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.4642561650693295,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.8127,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.514369238120762,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8131,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.47582294101026024,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.8402,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.4561765696046747,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.8312,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4088762133518043,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7634,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.5327556918534464,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.8029,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.4452671115414226,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.8139,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5333061507594044,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8561,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.5268131404123165,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 0.8553,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.4290420019311977,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.7942,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.7470777506995214,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8548,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.48833920253500857,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.8028,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.5456098386553334,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.7936,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5519753117074031,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7578,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.5262059249313784,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.9176,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.48905451098926384,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.8056,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5041885687187533,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8784,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.65992878211206,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.8824,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.5217525886734332,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.7804,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5270075627577061,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7688,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.5568832639357242,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.8603,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.5789452452609885,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.7881,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.46035737500928864,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8328,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.589470313748269,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.9397,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.6242353402465898,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.8824,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.5050850899431011,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7984,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.5655401544347279,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.8426,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.47035047199101027,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.7603,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5223121829849106,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7486,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.5794215317840429,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.8598,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.5378971548326088,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.7919,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.5343936949760187,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8111,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.5811898199506572,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.8998,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.5835742724881763,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.8931,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.47510131365145686,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8002,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.5761874411759367,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.786,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.5960372142616529,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.9059,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.6080990060254836,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7834,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.5129966938004138,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.7771,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.41987896899713706,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.801,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.6334277283030731,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.9359,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.5141312787322188,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.8135,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.5090938010451109,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.781,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5379179147704597,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8338,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.524448342726279,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.7992,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.49958124015235883,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.758,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4637100100117015,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7357,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.8660919139153875,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.7904,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.5204581180046053,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.7835,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.6988181581621069,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7778,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.4410751304362726,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.7945,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.5624349211581529,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.8347,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5558362478822403,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8885,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.5087247847943858,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 0.8215,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.5548702062919935,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.7815,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5408759607428516,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8004,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.5902227242683268,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.911,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.42505372400837477,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.8491,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4700905729949172,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.8265,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.5905772236187293,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.8424,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.5331782932345496,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.9053,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.7431522727178561,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8982,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.5079475839431324,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 0.8392,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.48924012256515104,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8213,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5182419828560418,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.7863,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.5010632560906729,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.8226,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.5204323847809875,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.8305,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.44514518800717273,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.7703,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.5117132531480701,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.8138,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.533659319178954,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.7735,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.421932266465524,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7841,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.5016188476076787,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.8096,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.4866681536952131,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.837,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.5212920654667302,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.843,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.597959752214,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.7984,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.5432818174149677,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.7558,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5810158726425654,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7635,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.5099329403500529,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.7993,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.4615061145478328,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.7879,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.45842820312923444,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8371,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.6025405029850975,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.8774,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.42477595208472296,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.8295,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4382418484595955,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.79,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.5340809779964995,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.8423,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.6657501599265587,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.9512,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5014340782665344,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8572,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.4525484927078218,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.8157,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.6142326781051878,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.8234,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5709852030076734,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.9266,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.5185305175522327,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.808,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.4382876729973094,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.7584,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5590060126752588,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8973,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.49611786385112067,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.8716,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.5049439999019059,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.7737,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.46712456182633916,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7859,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.6081204529268448,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8756,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.43838191084634615,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.8247,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4258857376163505,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7771,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.593866093205508,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.7866,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.4070136732910013,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.7615,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.47446825115829455,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.7493,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.534291016113494,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.8327,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.5456977282318515,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.8143,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.48373802875797434,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8229,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.5203841178534363,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.8083,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.5629772908785765,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.9057,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.561895722100035,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8782,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.7321171724122697,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.8782,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.4447398608848945,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.7897,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4949712922597885,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.825,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.6690982394856024,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.8946,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.44254020743234357,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.8046,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.435787744927338,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8009,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.5023689018286775,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.7905,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.4493265664785308,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.7665,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.5362087506786596,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8088,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.4450062730399959,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.7885,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.4632062205666545,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.8023,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4316089402576449,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.73,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.4253684655222452,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.7307,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.49194412033330853,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.7542,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5524366372423557,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.8655,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.4569670439362406,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.7719,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.4621197739520012,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.7435,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5996689512472547,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8705,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.4995831781161566,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.8065,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.5536628209320402,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.7531,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.7264022684471643,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.9006,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.48996010531672174,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.7176,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.4986301394800893,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.7331,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4582507844574082,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8075,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.5240490196517137,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.8127,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.4558191512965942,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.7936,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4955689981126501,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8497,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.43246762551813506,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.7647,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.47021749367794397,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.8128,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5542410122264593,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8988,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.5027820701759819,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.8469,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.4448189784336865,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.7906,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.47925425360609064,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8258,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.5208769466787269,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 0.8222,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.4154088441354576,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7513,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4855637437288716,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7959,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.5373917222804588,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.8802,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.4887205340946534,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.8318,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5695249330104949,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8529,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.4188870625608363,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.7977,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.4781375918853016,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.786,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5322759754142233,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7719,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.4465537339360222,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.711,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.4649060332296204,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 0.8508,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.47043108444414206,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7746,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.6758924041080338,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.8107,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.4559404927513359,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.734,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.42429474328097455,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7219,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.47327928036650596,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 0.883,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.502081625626178,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.776,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.48026421006742326,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8542,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.4264192649481206,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.7773,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.5041368844234315,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 0.7874,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4101983489697829,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7619,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.5440978571078002,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.8104,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.5247219256270258,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.774,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.6612971325447901,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7768,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.5126463491957624,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.8796,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.6117943304592908,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.7273,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.7762971712464279,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.9606,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.47937349115279854,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.8303,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.4820045733822554,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.7213,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4270495896837966,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8148,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.3958259756289764,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.7494,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.45597726469299843,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.8499,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5244564069581154,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7156,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.5227787446025458,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.8284,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.49291342948039957,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.8306,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.47180595085480614,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7799,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.5651113460887758,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.8706,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.5342570203703145,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.8137,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4412344364200379,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7911,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.4375767139194271,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.757,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.49772498084372346,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.7478,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.43884306270757545,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7974,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.4529967915347486,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.783,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.45807911831139486,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.7645,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5197256879061048,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7813,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.5038334071231932,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 0.7599,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.49670102860084564,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.804,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.468516292536784,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8496,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.40283967469213017,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.7902,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.4296043119455365,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.734,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4989654907778812,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.848,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.4705803121651488,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.8161,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.4228684176978226,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.7375,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.49457291733883274,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.8442,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.4407863834288453,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.7497,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.4301210165073435,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.707,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.43292672185524217,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7048,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.45849806176782326,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.7985,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.42800637308038647,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.7615,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4908767707999642,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.762,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.4863171414963457,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.7672,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.517417694309494,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.8883,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.40528442843383106,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7426,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.46897332398261404,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7899,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.5277810073048738,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.7809,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4915143492398682,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7902,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.4025228826702654,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.7379,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.5419896681574701,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.8238,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.49623969274297897,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.6811,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.4802001235125401,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.7658,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.4789639246740269,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.7643,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.5348953109627322,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.814,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.41953332867974436,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.771,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.4245439856312734,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.7922,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.43877334769248494,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8109,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.39656939792093154,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.7076,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.44555770939906086,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.7564,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.45431927372201913,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.79,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.4738456629646632,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.773,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.4080586869728145,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.7973,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.40915047638282037,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7118,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.43249592315285584,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.7073,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.4021372609040263,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.7256,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5175450241514117,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8151,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.4532390696629612,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.7459,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.5160049883677659,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.7731,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4880252901638021,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7964,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.4270119556740443,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.702,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.4839218946470525,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.8351,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.551660540948947,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.754,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.45043280403669955,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.6675,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.44065277624899946,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.7586,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4327253551488894,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.727,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.43761925651145245,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.7435,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.4630305498739546,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.7641,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.5328992513303824,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7626,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.5098188095859564,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.7274,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.4889687065223941,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.8114,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.49009896834756567,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8492,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.4447894837155413,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.7614,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.4130993532940369,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.7396,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.47012331091275783,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8026,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.4929187617955688,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.8644,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.5990703888599299,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.8559,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.609077809217428,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.9045,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.4424878133988063,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.7656,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.46215474420371894,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.8207,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.5061061618813556,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8528,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.5097362762712476,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.7266,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.4565147351609864,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.804,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3847984326162943,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7273,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.5152582717564496,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.8093,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.47173214566980104,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.7882,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5261921918919323,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8545,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.41804637000571243,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.7672,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.48436297650607635,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.818,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3965302090838612,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.6821,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.49489973326421677,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.7431,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.4122066592989025,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.7383,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4951284382180182,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7821,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.4576128031894911,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.7218,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.4493219019007947,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.6925,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4068671936907414,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7211,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.5713010081475005,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.9084,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.42999463693606377,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.6999,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.49468756297859245,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8386,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.5410893554444948,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.765,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.4546813988185285,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.8114,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4504507568105852,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7781,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.44249447326871505,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.7737,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.47403384634933343,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.6962,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.41782602249492984,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7021,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.5043278124162154,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.7627,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.40891139304823176,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.7153,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4760123604558327,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7927,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.4801579781447219,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.8051,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.5402948140632149,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.7613,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5388484457151901,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8343,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.5005329170598786,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.8039,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.5127304530705005,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.8015,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5606682188930232,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7967,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.5731550904456906,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.8197,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.437099128458472,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.7945,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.4401442760833195,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.76,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.44568903031232776,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.7695,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.4662245087633688,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.7496,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.48524667118673764,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7303,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.4858668556114856,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.7747,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.4274364003282721,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.7606,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5174559878401154,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7883,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.38984437501119923,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.6835,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.510189489917776,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.7993,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.46968887006552384,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7332,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.4399984189996566,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.6953,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.4333691148080423,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.7493,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.41178366424868323,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.6692,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.44677197753635806,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.7781,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.43922123270983526,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.7271,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.5059324491995313,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7568,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.42926085514645107,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.7135,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.5065137362294944,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.78,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.5253612969729855,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8232,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.46219676227828016,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.6758,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.42711135894094,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.7147,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5561434628339826,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7576,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.5493160920813105,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.74,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.4995268309248488,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.8139,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.4433354821933893,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8007,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.5357396937903179,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.7468,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.4331008924018268,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.7474,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.41132216704825353,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7194,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.4438940202038279,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.8123,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.45245657206075135,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.7344,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4564352564095824,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7917,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.3874291215657798,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.6891,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.5028289800240948,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.8464,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5582262301866101,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7726,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.42059169501629723,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.7472,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.41864929349079905,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.7511,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4871627192526415,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.827,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.47436886259604033,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.7568,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.5140453593325253,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.8633,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4111140417805916,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7271,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.4771323698261575,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.7806,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.4563126889883716,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.7908,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.4031187640915982,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.711,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.44264116877521864,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7092,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.5345425982074496,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.7842,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4161974533856049,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.6786,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.5142274058224503,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.7535,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.44854955901430443,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.8062,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.40994996559176605,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7155,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.6256900755515169,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.8068,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.4660223730621378,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.863,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.41447768458551026,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8082,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.4671048691178654,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.7543,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.4793035132197613,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.8101,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4194968825868918,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.734,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.4756278061404401,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.6976,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.5252695532481464,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.8406,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.49637457592042844,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7017,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.45696393902348287,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.7684,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.7093100010804253,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.8497,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.385589836558898,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7077,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.4437814175166149,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.6783,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.5403208355435255,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.8167,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4777519924737615,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7405,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.47455092881601535,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.7418,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.5078254188770654,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.8115,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4848194549707084,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8614,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.4199230658226091,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.6882,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.5610581015624099,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.7367,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5203286430950974,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8513,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.4507578809785903,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.7752,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.5027680945230145,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.7688,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.6395933200679313,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8296,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.40110869471345095,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.712,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.48579031734817096,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.8177,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.43157045704585567,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7617,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.3977611529980882,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.7125,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.38024132887630846,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.7171,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4706211326108242,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7344,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.46613957605230466,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.7434,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.4322078029947928,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7233,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.43238643453268405,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7688,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.45474994966479154,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 0.8313,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.4251512961228027,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.7558,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.45536218212756147,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7753,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.505119345914215,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.7423,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.39638892664668407,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.7193,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5170721093418492,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7876,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.5830209511644934,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.7915,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.4250910734443285,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.7623,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5508648649338562,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8407,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.44339150521035003,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.821,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.4204590356364698,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.7017,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4266174970093984,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7526,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.3686983453315899,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.7443,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.49394407496261644,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.8009,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3921562939510097,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.727,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.4863703161638804,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.8435,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.45034045917608595,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.735,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.47138423288489745,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7411,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.46312281124205457,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.7081,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.43680826598044026,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.7495,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.5512546016583748,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8107,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.3735916418525817,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.6893,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.7983497181336426,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.7733,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5070898993355771,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8004,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.5557912695238505,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.8447,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.4265134918906563,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.7431,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.5131335988369845,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.6951,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.46778912176756743,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.8554,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.5839226757599676,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 0.8272,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4292373049276344,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7448,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.5027417769836826,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.8897,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.5143260943645462,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.8561,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.459333183248125,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7571,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.5267442512392976,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.8453,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.4829723879868959,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.7506,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.45171016539865366,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7365,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.4835600138246436,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.7528,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.4903456759773783,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.7437,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4502694723956876,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.729,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.39570873025854986,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.6472,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.5791683138724544,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.851,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.44369166106283936,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.745,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.47344841197158355,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.7759,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.4341654595149137,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.7456,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.47784091229509834,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8005,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.4965772109689377,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.777,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.489350617666454,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.7874,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3797260577387539,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7212,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.719418060615365,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.8886,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.4122537871991835,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.6815,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4045742939869129,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7244,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.7494132124205607,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.8476,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.5129079548438056,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.8489,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4769566008781452,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7932,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.464450980037988,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.7201,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.46167722460127364,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.7502,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.3993696568375122,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7628,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.411043963699614,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.7087,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.4240472113939744,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.663,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5982234250548082,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8586,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.3972292074402739,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.7286,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.401535905669461,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.6492,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.44233011442238324,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7529,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.5054692473679642,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.7722,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.4791933574646442,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.7403,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.42779504040868144,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7867,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.4512514222879521,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.7094,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.4229210794064544,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.7424,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.43432322426676645,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7651,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.7159268812283518,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.8139,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.47140640466596184,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.704,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4413700493453814,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7438,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.4648841803809812,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.7785,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.4807862323463371,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.7169,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4512623686501777,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.6775,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.4350999291313162,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.7127,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.48905734134433115,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.7746,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4292773298576345,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.645,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.5473772978878985,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.8532,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.4297784283931829,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.6987,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.4535468380483471,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8215,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.4997031553280827,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.7631,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.48418173193860375,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.6708,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.581101794864796,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7584,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.41527374436791076,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.6795,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.41383128210916,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.7419,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4498028082700358,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7256,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.48312892011273806,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.8031,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.4363637584208029,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.7502,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4380402858113743,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7685,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.5372704180044607,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.8039,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.42058772305209075,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.6993,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.3985817884467263,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7098,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.5548108735480529,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.8505,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.4963906065238684,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.7662,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.43676747549593675,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7631,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.4401411422093453,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.7322,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.5302779178800815,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.7402,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.4958846122339328,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8676,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.5722256043147971,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.7983,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.6465608576747207,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.8204,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.41234277879273545,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7872,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.6233071464537084,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.8753,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.4568477377015119,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.7846,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.41316253146665566,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7075,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.465154997074634,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.7647,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.39978449383760306,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.704,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4697537669165921,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7608,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.488790150430786,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.7766,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.3892107195990382,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.703,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.5189422916788797,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8422,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.5430678631199745,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.7868,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.47630987530323626,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.7861,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5376800915747006,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.6851,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.4944054174823598,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.7422,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.44069188496058753,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.6973,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.46226494684837316,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7538,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.4443950227191254,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.7399,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.5685552493768595,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.8605,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.44119141577849835,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.6903,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.4605611289915192,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.6843,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.47782950798344975,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.7513,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.5029707795921143,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7272,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.49064029700122663,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7726,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.4568333689649895,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.753,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4835061310434721,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7956,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.4586904494238172,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.7414,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.45267699473776096,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.7528,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.5006990456048835,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7552,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.4420243760214426,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.7097,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.4630932696592339,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.8051,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4381474959525617,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7712,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.4362637164968971,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.7134,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.5337956170885394,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.733,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.42619553958238987,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.6973,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.45982115164566545,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.8125,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.5187286023606732,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.8014,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4030330003792373,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7771,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.5213695757622626,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.8561,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.4430898287378288,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.6942,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3960093676614589,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.6703,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.4461093949820488,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.7926,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.5209486668921074,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.7327,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.4174835442054574,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7323,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.5203101606683872,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.7456,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.43739688753171574,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.7156,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.47012908154083727,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7493,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.48821262555246026,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.7935,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.5096400845631791,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.8074,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5087836103278395,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7491,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.40597823511570036,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.704,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.44189955905098144,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.7286,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.4094001723854417,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.6312,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.47703476115864146,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.8125,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.4131946559039073,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.6799,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.5054724085494187,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.8006,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.4295937358340469,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.7929,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.6344466190997674,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.8001,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.5029154087876303,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7009,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.48505083375787206,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.7636,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.4281213329908672,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.7538,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.45395160003970775,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7168,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.42365833943737535,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.7059,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.45763802503026313,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.7389,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4545983554426908,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7503,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.3898745640926478,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.6693,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.5136523474448932,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.7637,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5659363117917683,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8121,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.5060082102672091,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.7696,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.429466840029395,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.7262,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4257298981810606,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.747,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.4204373050848424,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.7364,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.46223585107318704,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.7489,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.45422836276114315,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7728,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.43489775186795826,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.7891,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.39069125850805453,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.7273,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.4246531895440103,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7387,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.49072827335108293,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.8379,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.5478280015009428,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.7375,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3769981459132424,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.6606,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.48461775989790185,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.7789,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.48579640510721367,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.7903,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.49393967471009037,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6961,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.4112538399474424,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.7437,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.5103340356646984,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.7634,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4340726265092865,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.737,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.5477988532675317,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.7635,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.49762428997080593,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.7205,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4617320557674953,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7369,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.41210155221307404,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.6681,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.4910795134552265,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.8081,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5213128333185546,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.829,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.47921788570624435,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.7583,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.43595138360007896,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.76,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.42735307273006445,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.6781,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.39006746091488365,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.7359,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.5310398949825308,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.6893,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.46443101262094777,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7285,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.5146034622929456,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.8202,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.4348436928102778,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.7602,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4432613163819088,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7545,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.42553157744987713,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.7512,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.4699396848331131,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.7622,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.40606209684153594,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.6484,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.4355074989092574,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.7134,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.46703525087855696,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.7999,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.4664099267583723,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8064,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.5140904956668334,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.7683,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.4408781952457787,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.7514,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4800514398310778,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7411,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.4211586384557557,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.7155,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.421653090437245,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.7042,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3904998722681181,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6913,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.49633206000997504,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.7718,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.38695000482239966,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.6648,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4846438208269051,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.729,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.4988732653784483,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.7105,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.3890334427781973,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.7642,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.5015995117011884,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7951,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.4161455709852972,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.7556,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.3986358064231693,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.6742,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4231105241423163,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7393,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.4422640127013368,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.7435,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.43021573691414106,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.759,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.42485595541853743,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7528,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.40479690878576835,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.7608,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.4932163955361533,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.7164,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.42200867156661387,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.6774,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.5467399094318625,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.8899,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.4169895054651297,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.7594,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.3938157519829676,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.6852,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.48117155195670075,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.7268,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.4522316546093445,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.7072,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4434853759843272,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7224,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.3982148416540636,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.6335,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.48780884007878533,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.678,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.3942170795871845,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6621,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.37315975023349823,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.6799,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.4804017797780288,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.677,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4866777828183144,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7712,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.48112597267634266,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.7361,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.49039545768870146,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.7098,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.4156820194805481,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7215,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.449495212879705,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.7859,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.5517682180192388,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.738,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4078107633758721,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7428,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.3848257011546897,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.6847,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.4705304268494785,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.6901,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.45351375128922244,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.697,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.4206152695860078,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.7215,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.4872622191747159,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.783,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.42120062669574676,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7548,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.42322770575160906,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.7057,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.43128822409554385,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.6595,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4192929353710216,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7343,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.413275458081682,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.6916,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.3690774046989975,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.7141,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4617461073357319,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7592,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.4893411296699691,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.7884,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.48910757120341053,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.815,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.49374593997669053,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7179,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.44571095106319375,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.7098,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.41530745351569454,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7024,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.5525773689172044,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.8158,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.5170291279150868,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.7695,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.45091419541150624,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.7352,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.4879611538440513,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7401,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.377411463980635,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.6367,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.39877682093200306,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.6868,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5110745295985535,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.8137,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.557627149290855,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.7981,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.4785536029438291,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.7433,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.3950924824169307,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6917,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.38368541292649505,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.7066,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.43192553227021735,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.7538,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4508907660187954,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7094,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.45563511717824373,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.7877,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.4151235291365336,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.6721,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4748985553018345,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7979,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.4291758260327782,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.6887,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.4853152567669402,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.667,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.37695781697999237,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.679,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.4145462694391789,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.7507,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.4762167954960014,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.7404,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5049455761457096,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7763,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.4188665914477433,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.7405,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.412248135828123,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.7389,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.48039414957295357,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7428,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.3624702058083208,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.7249,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.5636795345502021,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.8385,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.4607953135742836,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.756,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.376729193046304,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.6614,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.3991667630290107,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.6909,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.47573220822123935,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7151,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.4588884817829327,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.748,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.5163011613357176,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.7619,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.5202514115332239,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7711,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.42017813629927897,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.6914,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.49298713784841597,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.7873,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4211312472740005,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.6849,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.46257868165210364,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.7719,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.44366844894252566,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.7104,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.45552496956208255,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.8097,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.5078343665730175,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.741,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.4461778897832065,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.7362,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.5357241749772622,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7631,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.46927262105236117,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.799,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.4328759143850609,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.71,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.5007509397077923,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7574,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.4416348515451264,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.7459,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.4698835680868725,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.7342,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.50252881148867,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7906,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.47604069277245004,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.7896,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.5366904106513515,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.7475,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4977277201391843,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.74,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.5004320754854158,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.7675,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.3497844453998508,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.6345,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.43523023207752554,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7434,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.5418558846172636,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.8334,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.47654734118721276,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.7781,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.3870351813154318,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.674,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.4461402164522316,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.7324,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.41058692356759663,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.7587,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4618492446489887,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7244,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.36974323409460746,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.6838,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.45087404920245155,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.7834,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.4579058796264781,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7209,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.47498833143544217,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.8021,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.45453279197662866,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.7371,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.47827951125770113,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7593,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.9472114532586452,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.7698,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.4051855162278431,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.7284,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4255203306929849,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6836,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.444687852998819,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.748,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.41768292366931153,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.7089,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.5861615055236309,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.8613,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.44600652403455104,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.7116,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.43336271970583384,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.6975,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.48013857861997417,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7971,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.5034434513967145,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.7113,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.3948595412837032,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.6594,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4640635824991028,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7221,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.470648685182896,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.7356,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.46249832108832006,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.7244,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.6054596781772245,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.8706,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.46149602091097064,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.7706,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.473517667257064,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.8551,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.40140203364399346,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.647,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.37806047932029274,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.6263,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.43531713509041514,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.7189,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.5729951222509617,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7874,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.48094925653615717,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.7358,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.4160815675654382,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.7087,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4012351762963187,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.6723,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.5005619939623338,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.7039,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.4289880721438666,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.7353,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4623424281862036,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7243,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.5238256798151496,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.6481,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.4920251365004284,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.7208,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.47212264759512007,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7468,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.4570252945087352,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.6692,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.4544825791391424,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.7347,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3866722439233,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6801,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.434025092111616,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.6937,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.4873335745190174,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.7189,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4059140162354565,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7184,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.5122139601061256,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.7952,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.4153275678011465,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7255,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.47828235983973083,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7161,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.3917227166614342,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.7079,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.5210383767114197,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.7076,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.42984081870006136,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7429,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.3743061187411529,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.6726,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.4448355922229668,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.6941,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4721939581213205,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6726,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.4422985194716223,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.7306,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.42105536963702145,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.7131,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3659156762371082,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7031,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.4665359443232002,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.7055,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.45621341040450175,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.7362,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.436441349809642,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7538,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.4508971403898046,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.7453,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.4269259836628523,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.7214,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4459383523611653,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6567,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.4465978200420246,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.7387,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.42088728726217783,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.6753,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4800131441916319,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7005,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.39495859931250377,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.684,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.38666835949944633,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.6587,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3785771871608222,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7127,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.4997168522784999,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.7406,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.4552535133442266,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.7027,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4281849731866267,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7355,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.41856303836706704,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.7173,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.41641475823433494,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.6546,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.41990224100948037,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.732,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.4568532612356803,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.7081,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.49956252479186747,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.7362,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.5600248883179616,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7427,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.39916392653079935,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.67,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.4288357738907657,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.725,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.45418123549236744,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.649,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.4903675423822203,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.7349,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.6131789191291735,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.7716,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4309385945084161,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7158,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.435760732981933,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.7038,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.5564258754454618,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.6625,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.44328423885398677,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.6896,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.48030019973891724,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.7395,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.5051613444338655,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.8078,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.4573909168100557,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7745,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.4401736616959906,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.7356,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.5004522458958894,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.7012,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.5228492610992537,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7444,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.3984380461729751,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.7443,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.4965680892197268,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.6394,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.4179899100906879,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6331,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.4485835346576027,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.7241,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.44535951940755986,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.7378,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.41528059437441167,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7482,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.4812608069912455,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.7707,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.6298099451387288,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.7764,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.49564263943530495,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.714,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.46763160567626094,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.7506,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.4166158646410712,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.6823,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4752311001226654,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7637,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.5010835968981501,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.7361,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.39733938852446354,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.7173,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.39211339428632275,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7257,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.5105825628365223,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.8402,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.5193039573985059,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.7425,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3862603973124171,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.709,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.45300153174614916,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.6598,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.44820320157660765,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.6945,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.3997412603202052,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7067,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.40947983018857464,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.6708,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.4754101993177675,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.6983,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.44950607717307856,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.8272,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.38602995140251695,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.6056,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.4393206083327871,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.6795,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.47478309312233763,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.76,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.47034182707880096,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.7437,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.48083147996893855,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.6545,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4336982554683627,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6859,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.4207220668613276,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.6818,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.3748167742989246,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.6447,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5050791445432369,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.741,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.47384529502461137,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.7561,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.36608612365026283,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.6721,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4468455436512149,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6718,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.43522896013997575,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.7471,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.4320446808545491,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.7346,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.39291534306609155,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6664,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.40501450732134475,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.6477,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.47059878387198506,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.7557,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4455770934005571,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6767,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.4939088817221293,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.6729,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.4964218357696382,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.7617,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4608211700887765,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7729,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.42795111113050455,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.7197,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.5007745897689231,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.7102,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.6999720720846241,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.8264,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.46304362298689367,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.7887,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.3985616879181766,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.7189,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.45627600783763306,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6619,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.356584407952678,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.6606,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.43577340520676033,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.7142,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4733156893422998,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.728,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.42808444154564773,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.7154,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.5007068163829674,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.7969,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4130841971473934,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6141,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.4353374083452603,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.7204,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.4366447906638421,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.6752,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4136216912670512,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6573,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.36380003077884254,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.6799,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.38066318132601257,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.6963,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.43640031931792256,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.745,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.4217155080690035,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.6891,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.5124061787151392,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.7634,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5316982626641302,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7308,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.3964336531410184,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.6524,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.44770223191113395,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.7057,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.400433927645052,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6621,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.41862647916803847,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.6665,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.4012182700945775,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.6907,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.42947453081849973,
+      "learning_rate": 0.0001,
+      "loss": 0.723,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.41970579172523287,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.6791,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.3714481537478899,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.6698,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.5237501849426753,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7405,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.47493042786920125,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.7338,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.4189611336365823,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.7938,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.45444657911733427,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6477,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.37069022687128605,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.6885,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.3762677879935244,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.672,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6230312694472444,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.794,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.5535522560335289,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.7513,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.41518575881430275,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.7236,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.5231666406971717,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.7724,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.4375656625124024,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.7707,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.4372291586618811,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.6907,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.4322224825025404,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6954,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.4069345600036601,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.6508,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.5586441490091297,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.72,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.47394620352062794,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7661,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.44740168580886186,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.6853,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.4475730347483736,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.6968,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4220184235511863,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.714,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.4725389504679414,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.7731,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.42852303235911554,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.714,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.43963207443504576,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7638,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.39489295936510793,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.6538,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.43974009060802266,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.7697,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.37811439525893237,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6487,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.5005279798935339,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.7951,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.43524518191637235,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.7264,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.45685119847738254,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7069,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.4332628472237465,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.6838,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.41200605772690635,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.6618,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.4271296345910832,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6848,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.4185761458588467,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.7216,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.5264701446142168,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.6779,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4020877365544272,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6837,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.5384160472172311,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.7121,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.5213296396932801,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.7804,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5270813891162284,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7956,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.49575141836996933,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.6717,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.42591287362664304,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.7224,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.5725826530948611,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7225,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.45388403632573054,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.6919,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.3771585033880052,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.6982,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.38059835446466583,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7109,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.4437724395723168,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.6903,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.5457205907235986,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.7525,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4916490505433348,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.703,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.4120828334760189,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.6729,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.5899202363159813,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.7549,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.40318575488783587,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6565,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.40673107079296383,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.6789,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.4498824076995007,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.6853,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4787445125107849,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6575,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.4505039428482767,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.737,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.4148893071953553,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.7067,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.40547430757080405,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7131,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.45950253100425287,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.6953,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.45380750671363423,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.7219,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4710021776162681,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7447,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.4938304629909478,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.7496,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.48659707936829805,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.8105,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.5717747968066121,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7824,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.41510218190079734,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.6887,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.4273989601917071,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.6775,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4090195999130743,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7076,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.5298938815336502,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.7224,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.36980990976501543,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.7112,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.667053722173872,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7419,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.5159519841077279,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.755,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.37328800367068515,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.6607,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4163781319007869,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7453,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.38413750861242135,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.6544,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.438990645670329,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.649,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.413988489597262,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7088,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.4047062021806597,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.6866,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.5325821394653242,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.7167,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4591998311226366,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6783,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.5025201483899181,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.7875,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.40676184484246597,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.7082,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.45801410510596946,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7613,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.41187950614770436,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.7598,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.5732262740881797,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.7329,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3818635915803538,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6546,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.39724465874359377,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.7249,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.4643601322601259,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.7876,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.40560055104386716,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6493,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.41988583263667245,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.6976,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.5281061830116633,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.7101,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4185855771334389,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7464,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.4169364040206677,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.6963,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.4138958003302118,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.6927,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.4281538675806205,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6665,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.470171983661211,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.7198,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.3897485372566952,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.6301,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.47207872974235343,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.652,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.580648892234986,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.7559,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.37649919680147714,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.6825,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.46132230846384087,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6524,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.4117939450778646,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.633,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.5251640489626463,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.6484,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.5370556593506223,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7925,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.48977171085261184,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.727,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.38950419140082826,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.6583,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.4513940074808248,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7201,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.44536962993224344,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.7007,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.4924820800581278,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.671,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4548878688956654,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6757,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.5097819864945549,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.6906,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.47639256146424586,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.7345,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.5605924808144956,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6721,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.473831236069468,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.702,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.45800812471608365,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.7131,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.39545170615928676,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6403,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.462802428054446,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.7314,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.4649214253109914,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6844,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.4383940632651265,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6449,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.4692348617737329,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.7542,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.5257629918914171,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.8086,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4588194630842873,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6464,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.48580849547296195,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.7594,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.353636881092341,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.7121,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.38104388079939217,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6567,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.42042891837825247,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.7057,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.40031403089755774,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.6226,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.4118951225176375,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6514,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.42668133459988783,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.6994,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.40110478497299995,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.6952,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.506616334073721,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.7305,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.39826405904340123,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.6628,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.4271293723300673,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.676,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.437120180626211,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7236,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.42255243493505024,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.6824,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.46590559822989075,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.6974,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.4463527365112677,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6912,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.4464292756636874,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.634,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.4486764707441599,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.6361,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.41175580150636143,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6901,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.45036368725317394,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.6799,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.41771862187974396,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.6541,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3996706258597516,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7087,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.43454543250144245,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.6238,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.40885044054759473,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.6826,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4164739833269892,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6845,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.4365546905048353,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.7073,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.4538456707835925,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.7201,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.37097894672741727,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6486,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.4474030266548989,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.6147,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.4219935785255905,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.655,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.44335158294294086,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7268,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.4354821147625836,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.684,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.4061371543693794,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.6952,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.5064732816872987,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6548,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.4391331847943741,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.6606,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.4335560681238166,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.7329,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4535808370951183,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6476,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.41898636185061233,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.6789,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.4494944636894788,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.7103,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.491356584275832,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.745,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.6830744799297692,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.7344,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.47208146062633694,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.7296,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5148113440315742,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6917,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.46249237893405354,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.6511,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.4333329759022181,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.7026,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.45181958290943725,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6934,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.4967025867464761,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.7365,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.44556605284460443,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.7594,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.43844495879403456,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7302,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.4447741052768386,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.6862,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.4044188770090188,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.6946,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.5415468788093395,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7217,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.39803291072156194,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.6049,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.42524764548934013,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.6956,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.40243540895739494,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6944,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.41108116013920964,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.6781,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.4180256922431302,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.7563,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.37883994845143076,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6508,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.48577703163002417,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.6649,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.4496672306844674,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.6892,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.440269970232495,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6979,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.43144139434154993,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7325,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.4567799170805063,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.6869,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.45884689535164136,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6246,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.4612139017442247,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.633,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.38772962724409904,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.6571,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.49487000040161555,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7017,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.39082581245765496,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.6365,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.4691915029959711,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.696,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4654745459694697,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6701,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.4960853830959196,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.7386,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.41269755829152927,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.7021,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.6735254066940529,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7665,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.5474011650485257,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.7174,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.397827080451777,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.6678,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.43277723143519836,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6794,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.4678172416628807,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.7276,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.47678291661801986,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.7016,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.38415068856345364,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6685,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.38330519173168615,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.6694,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.45012001154278036,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.6893,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.36465998860021354,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6549,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.4089045562238243,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.655,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.401000896754143,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.6687,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.45878175518310715,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7239,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.49041075314677157,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.734,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.39186563776406375,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.6688,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4914328053426828,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7282,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.40625192142635563,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.6386,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.3748063684152412,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.6615,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5215673018623856,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6575,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.504226772605567,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.6481,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.399025245567611,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.71,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.3330948835069077,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6139,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.4958690052247975,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.7787,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.518376029324385,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.7002,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.43775005862410454,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6702,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.564111859142483,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.7696,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.4406942671738771,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.6969,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4294131443095355,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7576,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.4005698087450695,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.6544,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.4229655347526881,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.7072,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4354204824694456,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7203,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.4090693349291068,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.7133,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.44127117465359905,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.7059,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.40594685075351816,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.688,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.4479936053531853,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.6753,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.44240262318228424,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.647,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4702963243682181,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7452,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.4403693769384026,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.7167,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.40999692307058,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.6198,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.5226868856788982,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7573,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.39179328679742803,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6695,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.42774356488428983,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.7275,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.43943837571035477,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7273,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.552480001257446,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.7572,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.48459244632161336,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.6398,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.41043053451708134,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6884,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.4107179284069568,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.7124,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.43047382909893095,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.6733,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.39600325407900394,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6283,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.41500514219441187,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.6449,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.4649038637148329,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.7128,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.3742635905884641,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6253,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.49873594095049717,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.6567,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.5030172499643145,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.7176,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.38035823222941917,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.648,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.403454227071662,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.608,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.39566732890510886,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.6571,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.39833030979330514,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6795,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.4622437312038266,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.7967,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.4125561000297353,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.6698,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.41048898462940947,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6844,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.43286647820008745,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.7044,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.487024153960826,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.7095,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.49409516688134575,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6861,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.39803526419291674,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.6631,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.4287146084213074,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.721,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.46736119936228177,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7044,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.4386736030522742,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.7027,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.4404033746840516,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.6616,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.45784499125755806,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6221,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.45788776415686483,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.7006,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.43731077678181945,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.6902,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.6796396357166941,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7697,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.42635858623764006,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.6844,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.5295579742057989,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.7399,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.41825020566671456,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7376,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.4803419664026688,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.778,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.8095754262667553,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.6842,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.37785107552632613,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6701,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.46132617217855576,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.6356,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.5269509945864841,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.6769,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.3922689653386676,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.645,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.555143415465988,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.7452,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.4475682529570901,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.7474,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.46825839397048047,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.576,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.47736062364675896,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.7407,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.41386791103089915,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.6498,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4115071829091007,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6727,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.38598858770760563,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.6473,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.4288611226500625,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.6626,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.39767727632829636,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6833,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.5055050883992983,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.6983,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5112414029165399,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.6303,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.42466226932803275,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6509,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.43147014942641604,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.6773,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.44918413339986263,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6996,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.40487296854779825,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6872,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.4715949353339733,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.6799,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.6058533316508485,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.7764,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.42200509928999985,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7224,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.44729875600539715,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.6985,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.6017859491906349,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.6907,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5070518017224357,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.7038,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.49813536052165636,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.7194,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.4184170317362622,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.6385,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.5010532458438172,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7317,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.5005339987888526,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.7765,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.5213241438385137,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.669,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.45957806458841644,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7257,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.4441361698356745,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.6101,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.4429998799364062,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.7337,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.40723007461235616,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6965,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.41413620831446507,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.7109,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.436997370362431,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.7481,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4192542523884157,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7023,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.3684514145841737,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.6542,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.4078503684704817,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.6315,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4654049362770654,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.7958,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.47020760443462817,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.7046,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.3660858077712867,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.6759,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.41944031281785055,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6306,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.44192112888266855,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.7037,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.39563893570133474,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.6927,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.4527215345915951,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7066,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.528383811512297,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.7627,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.4226464527745641,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.6755,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4474780169862844,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7362,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.4972514215365334,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.694,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.4945500820847561,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.6474,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.41877512278041723,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.5882,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.483327908182511,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.6754,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.4604880955395153,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.6747,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.45350310620758705,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6913,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.3920930895298731,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.655,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.37192017843441927,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.6205,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.37001840877376735,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6218,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.4198155173809062,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.6269,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.5257923572584187,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.725,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4231819929149378,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7147,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.37574676564116133,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.5911,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.4125533536902336,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.623,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.41763382077089345,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6186,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.47045704844553404,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.6692,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.43950313219525144,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.6338,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.44850994906610586,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7587,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.40076973487863327,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.6528,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.4763486099000397,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.6593,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.42317066038164286,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6815,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.5334792578675287,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.7538,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.4563517816630481,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.7655,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.457088132652322,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.764,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.45315406826456367,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.7333,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.4785175570304063,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.6752,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.5193290540475145,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6766,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.39166249453443075,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.6641,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.4444251457161952,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.6808,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4525915252002046,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6637,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.4445375117875077,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.7181,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.4412186282350355,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.6649,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.4272874352081193,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6575,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.4992767279616866,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.7525,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.48090114753757046,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.7477,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.40733017554217016,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6759,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.4498923712920087,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.6781,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.3990536280518104,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.6663,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.4970635315232516,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7522,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.4744644664117155,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.765,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.3346482618704202,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.5854,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.41626631799201624,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6726,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.3640006285780399,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.6425,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.45944766881500854,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.7496,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.5295144756125102,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.633,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.47970524929365166,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.6483,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.5288382109707465,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.6519,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4273640335221985,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7136,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.3698460538893361,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.6415,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.4961002979751885,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.7749,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3992713891696009,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6803,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.35674124778456545,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.607,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.3969586298474553,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.6145,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.4396329628072576,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.74,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.5244884710162703,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.6471,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.4569481617674653,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.7491,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.4407543083560062,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6717,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.3996784940180213,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.6755,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.4514537088669984,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.6173,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3664007104725477,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6495,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.4845923303179934,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.5729,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.5131027825325541,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.7162,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.4870252273445952,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6333,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.4382846518264229,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.6344,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.45462423014598224,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.7179,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.38601592208424074,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6247,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.5588254869023244,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.6824,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.38423012071444157,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.6414,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.36722452410125017,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.5833,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.4327130367411745,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.7005,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.4903701302239034,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.6898,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.40168516926848963,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6793,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.44376164501594506,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.6857,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.3587503335507288,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.5688,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4934285243182926,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7796,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.4404991951999115,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.652,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.3647155002598664,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.6375,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.49974533015504735,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6908,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.39018185328648913,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.6532,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.37228594811392435,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.6379,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3971951160318089,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6252,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.3599070866386319,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.5969,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.4379513340178431,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.6774,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3998705472017563,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6405,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.41760982475547026,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.6829,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.4366260280255711,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.6926,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.4106177310144535,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7056,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.3921999038233143,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.6919,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.41674680016384474,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.647,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4119048309083103,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6498,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.40572103472794274,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.7052,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.39057808572159125,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6491,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.491566833277267,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6583,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.4762257430976011,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.6663,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.5445065980715168,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.7097,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4543710104572786,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.689,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.407829920159229,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.6778,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.41732549426387294,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.61,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.5353744556728132,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7306,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.4175265894701234,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.6786,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.43922642748880425,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.737,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4591942338879461,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6677,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.42002570865671857,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.6193,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.4538601995375422,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.7202,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3831032040092475,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.601,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.43483694718557675,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.6504,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.433441180853285,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.6137,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.4574223889763118,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6979,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.576370653686149,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.7514,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.5090825579316304,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.6658,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4570716372493963,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7022,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.4243893702053848,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.721,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.40463911930908225,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.6872,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4691413149126142,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.568,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.4410995107571646,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.6605,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.45395735012404703,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.678,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.4455123253836739,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6884,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.466476454337529,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.7379,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.45254947510936605,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.6483,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5105718886675502,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6524,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.4153774316188661,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.6585,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.6287557308304093,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.644,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.41060708920000044,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6393,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.4584936591156118,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.6834,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.42760298449155615,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.6391,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.40847995117532254,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6677,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.38598233718275865,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.612,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.4154384657504431,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6217,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.6660104658498806,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7194,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.4231194901746939,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.6385,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.4653297067314939,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.7533,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.45689436023906294,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7248,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.40256887514928125,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.6613,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.6204123648495595,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.7044,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.43213476883749513,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6409,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.4646837388697919,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.6921,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.45353160693893724,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.6904,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5023230394327901,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7396,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.3935264738094166,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.6477,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.48635306499044456,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.6998,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4091423751364748,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6265,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.40384469236774845,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.6999,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.4120121582534377,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.6909,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.39083906952401376,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6236,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.41218289972349514,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.664,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.39384549030326926,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.6389,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4632752977068213,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6344,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.3773831474501387,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.639,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.5038423659696732,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.7311,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.44071340842899487,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6005,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.3992398623624981,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.6808,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.41390857698005834,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.6412,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.5241813239086165,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6991,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.3994989775650419,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.6192,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.4291846570848238,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.6488,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.41504568841541833,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6927,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.5175577081278326,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.6336,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.4173679144944115,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.676,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.560320983926784,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6901,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.5554519744795914,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.7598,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.480224673602412,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.632,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.38508153551994906,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.664,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.46055633682093294,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.7333,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.3961097454084375,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.6506,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.735253120855245,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.7185,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.3489013592721229,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6212,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.3889928529352225,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.6171,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.46397922957246035,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6655,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.39166678734884736,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.6863,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.5787530599310754,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.7363,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.5958819013877672,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6994,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.4201426615307166,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.5715,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.39623704747143423,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.5991,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4139662710085164,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6299,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.33924102653209187,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.5845,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.4007066797410344,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.6402,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.434895471979439,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6852,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.3921533618694153,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.6419,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.41674674428686015,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.7331,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4404240216773432,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.638,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.5100165492429559,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.6815,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.45911938575850225,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.6395,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4188734839089894,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.673,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.4194909092789948,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.6896,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.3734397655475053,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.6286,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.418794631207735,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7151,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.3668294531327107,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.6247,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.5056859106234943,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.6627,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.5031974197168696,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6123,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.36770625640776916,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6253,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.3950678989681755,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.6347,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.424691665342637,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6422,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.3932538680943357,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.6071,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.40241094021518636,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.6719,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.4361459984843694,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6396,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.3982081342410596,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.6533,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.3685938016957018,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.6251,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4343032209940338,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7168,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.33962441541583616,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6092,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.45358695497249857,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.6314,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4147278661270302,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6418,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.43803549259230773,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.6794,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.463212491061389,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.6362,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4166521400201831,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6285,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.46747358714893084,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.6832,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.506818733087523,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.6906,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.6232152586172739,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.611,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.398009057094343,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.7032,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.4954796533712989,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.7264,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.40108521022798854,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.655,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.45598380321275594,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.6988,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.3871915430991362,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.6438,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.42284413388889525,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6793,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.38536687081551113,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.6552,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.49040526777930155,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.7177,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.372303282355137,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6111,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.4734318146302882,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.625,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.44864014345408976,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.7238,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4148463663422953,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6828,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.4474148187744677,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.6902,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.5980958684596339,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.6955,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.44142730805286245,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6299,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.4330012446563654,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.5907,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.5569374098186959,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.7162,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4563590039156943,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6869,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.49580207639687396,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.6722,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.4120313480188349,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.7161,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.42589014999870267,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6424,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.38193578525540006,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.6497,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.6245364403023802,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.6005,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.37241477418400815,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6059,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.39893356007131425,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.6535,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.4889994395764013,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.68,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.33046179141381127,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.5902,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.5711800874080143,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.7674,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.4549457289530646,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.6665,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.5699317027188194,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.7299,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.4020798210602477,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.6339,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.3995482232637711,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.6528,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.44557820935371334,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.5653,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.4233355126969602,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.6653,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.4261010144722392,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.6439,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.4386345128850192,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6489,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.45087774496786187,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.6529,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.3975402796187728,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.6548,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5480017096708946,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7525,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.3942083871662254,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.642,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.45959459072184644,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.7472,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.584546316076038,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7557,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.41340745771988363,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6449,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.44490990245648465,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.6575,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.38421477120172864,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.7134,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.5435579560675197,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.6273,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.42269084778287574,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.5812,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.44831986276577557,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7349,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.4834054201932014,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.6461,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.3814636033243915,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.6315,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4487163646690131,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6516,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.45954131188099634,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.6191,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.5934862372056542,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.6278,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.47139248381987847,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.7213,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.4329217337090352,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.6571,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.40005387036558254,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.6529,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.46448885085215247,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7176,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.5931599643544335,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.7218,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.5804676849521008,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.6989,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.5437723742515328,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6913,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.4471021431166154,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.6592,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.40378540338053776,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.6474,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.44201360523337074,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6388,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.46975363512733703,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.6647,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.41694059202066175,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.6641,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.43436459491647883,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7326,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.4914570957874253,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.6595,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.443023839564708,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.6915,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.37100018571953414,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6382,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.42875477380139815,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.6479,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.4485483572251155,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.65,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.4906779644680305,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6578,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.5061243094311983,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.73,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.41422267631820975,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.6828,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5606618613098131,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7368,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.3522095805748436,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.5896,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.38823566997342707,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.608,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4045178917126892,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6504,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.376864282481641,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.6924,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.4513033853517228,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.6463,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.39152204105908295,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6315,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.41193491535153914,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.6635,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.33939290424227786,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.5928,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.502490061859469,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6716,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.37138805758821103,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.6555,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.345152734180387,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.5893,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.36996512317730207,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6358,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.4232283639253023,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.6177,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.45832069051361574,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.6688,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.40990727691313,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.68,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.38082701571782146,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.6278,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.40993562802627775,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.633,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4123608049941386,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6222,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.40104591477234164,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.6481,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.4793488240086187,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.7249,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.36096504113832656,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6014,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.47041779848564563,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.6758,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.48596104980486965,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.6595,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.42683599920343207,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6439,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.42736175129702464,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.6612,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.5777785499809717,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.7469,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.39216045037865677,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6363,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.4561800936774624,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.7438,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.4943567026640203,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.7048,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4814877788409868,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7268,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.4092600930058757,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.6065,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.409685982408688,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.6506,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3870958481224451,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6187,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.41553703471203224,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.621,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.4093322391584789,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.6051,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.47041694634040865,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6844,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.418429662076582,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.6589,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.3829077339559944,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.6478,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4534298654346946,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.7143,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.4859898034600532,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.6448,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.46432539091272734,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.6885,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5085493694017713,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7544,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.4473521312310639,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.6196,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.4747926932747285,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.7073,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.4009158607052678,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6103,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.4475180779475372,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.6572,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.3884492418674599,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.6477,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4332361050436333,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6365,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.45285475697097116,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.6509,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.37958600626287053,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.6218,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4082527332253524,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6273,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.45348682552554404,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.6677,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.462376062388315,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.6577,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4145611018534509,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6881,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.49556932106832846,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.6319,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.41068976055254247,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.7006,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.43485989501407807,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6392,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.50740804285652,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.7311,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.4047173072522602,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.6593,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3894499674015839,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6533,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.3659261227794051,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.614,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.451797701464948,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.6973,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.49015568983154395,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6806,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.4382845781834796,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.6291,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.4391521683429244,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.7008,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4304589058729017,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6583,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.4864114334043858,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.7412,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.43922743441940315,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.6269,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.5474810445951583,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7699,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.41757604429661016,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.6052,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.4167594304186119,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.6862,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.37822319125493636,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6262,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.5023340764183061,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.6251,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.4079889689772476,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.5957,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.406420779466018,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6523,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.43747980019701993,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.6769,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.41352787710817335,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.7113,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.40583860313023423,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6809,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.41158724418458603,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.6402,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.5243531711929061,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.673,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.337576549622103,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6261,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.4754446973458503,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.7401,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.4164840399768914,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.6357,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4135462920490355,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6859,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.7234470340955943,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.7088,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.36432432756450434,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.6534,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.48065737527318425,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6908,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.3932409524931165,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.6908,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.4346120600494569,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.5935,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.44275062077364813,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6455,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.392220127998605,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.6515,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.4643778577168819,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.6172,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4274755369987598,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6109,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.46868591713180785,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.6145,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.5042715565932409,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.7054,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4154466335564727,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.5967,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.4216382747965016,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.5687,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.42449643103424906,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.6767,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4001245042411552,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6211,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.47478289908225485,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.6765,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.38673287441234144,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.6299,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4831165164661389,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6253,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.4254562070627475,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.6319,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.44905117536001765,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.6273,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.4670117155323974,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.7235,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.39418772023176973,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.5481,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.4756135363588192,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.644,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.41633182755771625,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6518,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.4621793648032974,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.7258,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.4323272116296017,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.6358,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.5295944212721158,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.617,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.4364076762764822,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.6217,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.5300885117518196,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.7928,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.41327557859176295,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6586,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.7325507837569631,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.6572,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.40786063676707335,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.5755,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4272122242567254,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6983,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.3998728235135429,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.607,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.5348664801254471,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.7396,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3995469349406626,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6787,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.46000290049490755,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.7569,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.46010987639731693,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.6347,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4978647672880174,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6699,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.35737868752140106,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.6557,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.39647211628608875,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.7077,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4513532218109532,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6431,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.4457417615670569,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.6464,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.5455223307685727,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.751,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.5051881456825834,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.7221,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.5309592517243472,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.6432,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.4867932356703566,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.6611,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.5584650616591511,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.7606,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.4728468412522664,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.6698,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.42002690440527485,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.6121,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.39957076551971166,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6648,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.44850496464790096,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.6803,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.4436197128959715,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.6027,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4042949955641835,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6297,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.5007062642674075,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.7184,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.4205208053626174,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.6676,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4431673816438805,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7164,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.4249690226422859,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.6665,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.4936236618332871,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.6611,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4446079264504513,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.5919,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.4188549944038524,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6815,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.37412575666763476,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.5891,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.3973365621648777,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.5576,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.5505908525386042,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.6993,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.39051526021548083,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.5802,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4588299711160169,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6591,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.43305516598913746,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.6858,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.4181562679597887,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.6112,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.5274864643828572,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.7102,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.48775833418327424,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.695,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.5580209662077895,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.755,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.40488479521891896,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.677,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.4199604648333833,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6491,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.3844521137272803,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.6531,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.48731897208695457,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7045,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.5383857503345972,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.7467,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.3962907679692899,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.6084,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4363993116628175,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6224,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.3815183988303869,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.6161,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.35610263564934913,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.629,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.40053521504278355,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6371,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.4369059990678778,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.6471,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.40213261631132446,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.6986,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5612624532899247,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7653,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.5538150635552385,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.6946,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.4705386625620636,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.6737,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.38927429742939385,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6392,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.4248654288152213,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.6731,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.6866722643448364,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.685,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.45430216283323627,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6642,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.4908708531043127,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.6784,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.39701150718508316,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.5999,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.43096087129001465,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6403,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.3358603543528395,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.5447,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.420568826048583,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.639,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.4656415532714179,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6152,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.3394905390020499,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.5866,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.38209415876202724,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.6513,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.39565128177302533,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6567,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.4285550280410376,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.6859,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.40854938238705735,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.6726,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4158470411645834,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6526,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.5060676799282359,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.6022,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.42024963141870625,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.6029,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.3818807214931423,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6145,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.41950692764051695,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.6712,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.4948719627454073,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.7097,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.42376708716125905,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.685,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.46599713659798614,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.5762,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.502423239699822,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.6482,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.41704783463295836,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6231,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.4471318403366392,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.58,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.4930432533346224,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.6066,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4191093443697424,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6389,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.4643665707610858,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.6667,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.39975257989388546,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.6552,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.46064028104022436,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6825,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.4004501369453656,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.6653,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.44625854340376037,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.7159,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4434161255187958,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6436,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.4344127642206033,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.6412,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.40063459126374223,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.6225,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.43356689270483173,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6219,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.5220526369818685,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.6594,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.5423024778568717,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.7521,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4004007956034665,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6324,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.46326132066593056,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6959,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.5839550416919262,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.6809,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.45523305572258604,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6495,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.4597435532782685,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.6678,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.424409525577842,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.6858,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.46094318525091577,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6388,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.5355779994156216,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.6435,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.3531036043502969,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.545,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.3698317888394057,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.5993,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.5431907428668711,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.8041,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.4553855167796496,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6571,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4350567834631104,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6423,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.4747873187224694,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.6739,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.40461884370271717,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.72,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4167798030556322,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6255,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.46845901849429383,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6545,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.3973222335473333,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.6372,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.38073284437655835,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6289,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.38073722863953624,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.5871,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.4190946658205246,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.602,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.4337641950485008,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6911,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.4265497198339448,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.6969,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.41996031012540436,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.6376,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5499886782502461,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7559,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.3828713210153735,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.5777,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.4611247428551417,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.6816,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.36152230735902663,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6159,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.4419954083812373,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.7064,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.4265143979697728,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.6554,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.39116645586599447,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6161,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.567563131148666,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.6576,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.5093875925229141,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.6774,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.5584664714476052,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6326,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.487255380633235,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.7465,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.4350224105217472,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.6694,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.5007742206468339,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6576,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.49278031797530814,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.6655,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.36776361811801445,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.6232,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.43084289840079937,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6286,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.4415496400128871,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.6283,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.3450616572510623,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.5864,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4149338144520578,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6326,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.38583315875884233,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.6516,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.39196067997004447,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.6474,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4627428355091497,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6496,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.5094001659759443,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.7016,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.40974755328404217,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.5765,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.40032544663655756,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.7405,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.41338686182286327,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.6478,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.5006313374914235,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.6717,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.3584787782331542,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6608,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.3957933792772522,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.6643,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.4720312848719183,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.6611,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4435292333643389,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6347,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.41758627561980133,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.6293,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.4785062670362954,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.6627,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3946963967872426,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.5852,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.42829762589529746,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.6636,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.44071064352491884,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.6802,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3940438253700029,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.5963,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.42639732872760644,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.5808,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.4166432355006099,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.6667,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4161754160413017,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6684,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.4216631747912127,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.622,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.710984201933716,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.6509,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.45259145045156,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6937,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.44887693286795777,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.6366,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.42828981202862987,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.5888,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.46509297587211545,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6257,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.4244069577499102,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.6163,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.46625633515460524,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.7083,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.36187151298797665,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5688,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.4789112174362444,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.6826,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.5058145200460931,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.6656,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.4508626575496384,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6407,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.446817396692741,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.6505,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.4559128212421954,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.7048,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4693187807566613,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.5972,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.4303581174526853,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.67,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.40639260137853606,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.6457,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.42425778703745654,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6568,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.41936464307298676,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.6312,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.4194137029906762,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.6429,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.45631605031815015,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6457,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.439101437133196,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.6866,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.4851770734086949,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.669,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4469981928465987,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6478,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.43997181930545687,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.6979,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.4445565582998424,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.6102,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4641788184566056,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.5776,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.46727608567067147,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.7867,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.4669485338942387,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.6466,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.38202673202303844,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6139,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.40543450738181425,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.6796,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.46735765973973104,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.6674,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.43420510375418675,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.5687,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.4329805481272118,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.6635,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.5417662911864329,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.6103,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4466295705323495,
+      "learning_rate": 0.0,
+      "loss": 0.6386,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1615624508571648.0,
+      "train_loss": 0.7279649205525716,
+      "train_runtime": 29022.034,
+      "train_samples_per_second": 1.034,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1615624508571648.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6012a4ebec36e2eba6258d3b4595f70f4a64a094
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "down_proj",
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ae50c1d9bfd9d7f87e5ba15c0b44de3afa36f7f6
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba6b277f48d7dfaa5e05a643477dbb65a3549a1dfccfc581101891ed2edb6ad9
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b65fcb6adfc0c662578d8d8950e1f639bb56effa
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9a20fd34a6c8ca151907d24cd0b8d9b1480f4a9fd387812faa61c971b1da711
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..168989d3270dcde7a3f34471ab90cbe646a6c73e
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.9119700272975201,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.3821,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 1.0718196643084557,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.4152,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.1393778386467817,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.5635,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 1.0131635593166934,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.4527,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.8693186056499881,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.2904,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9535159566588861,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4675,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.9025498048400175,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.2784,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.9991334895096446,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.2612,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.8756434808752498,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.205,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.900983440467934,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.0833,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.8303584673970603,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 1.0393,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8616493818161056,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.0162,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.7882583515645812,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 0.9879,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.8990632518216867,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 1.0495,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7193602759554054,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.9614,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.7455038933596019,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 0.9421,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.7662290819962464,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 0.9845,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7454656513898643,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.0582,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.7822446530803914,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 1.057,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.5982877909907908,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.968,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5840007619802585,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9889,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.7592239226578813,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.9574,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.5328144086451,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.8629,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6020252347294877,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9021,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.7507292569128217,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 1.0836,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.5839345053116907,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.9046,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.6215030194976022,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9765,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.4915343259310359,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.8249,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.4972319010091224,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.8724,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.59980842610381,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9404,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.46809304950939246,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.8375,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.5945233102204843,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.9573,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6260137470165155,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9153,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.6023912493485183,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 0.9513,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.5556194850064836,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.8632,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5021439033588748,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8663,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.6624496837211223,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.8901,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.48543070481003414,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8713,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5724306483192274,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9028,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.49738799856649346,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.8829,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.5778587860212584,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.8749,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5426013547625161,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8475,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.5662964359473427,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.9221,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.5117063133824206,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.8964,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.4969149597813583,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8306,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.48172094638505625,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.8223,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.48071715164871254,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 0.8312,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5292675753394324,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.9317,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.5342209972167025,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.8673,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.5081316121656525,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.8913,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5588855560452214,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8175,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.5323609152601871,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 0.8262,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.4803683672039637,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.8626,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5118088641558101,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8519,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.464785397119558,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.8197,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.48244517394651254,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.8196,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.5228604284909696,
+      "learning_rate": 0.0002,
+      "loss": 0.9283,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.43974129140484236,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.7655,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.4925496871049201,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.7855,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5410668694562225,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8061,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.4921790802433175,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.7518,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.5760430011798686,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.9381,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.6915625292235289,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.9606,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.5017585919637912,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.8618,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.5509276120158647,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.886,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.43360403605997344,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7596,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.49634898390287524,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.8257,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.5745231614633087,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.8586,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.8266645215289209,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8365,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.5477131617262088,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.8333,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.4705032521122792,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.7209,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 1.1485917970768864,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8667,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.5213355277551339,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.7743,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.5383053332172496,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.8704,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6013015270279097,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9075,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.5678333700613905,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.9107,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.5042385210985819,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8281,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5228070045040702,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8563,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.48593817752220875,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.8274,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.45881395235536493,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.7983,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.4694744200503165,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8657,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.586378163426478,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.8803,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.4795277369427708,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 0.899,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6019636798520344,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.889,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.5343643349771755,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.8313,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.5055538415468328,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.7763,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5627564761750243,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9671,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.48616773415275505,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.7953,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.42410036246753435,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.7518,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5010923642769226,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8625,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.5520335058511099,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.9194,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.43426528508465045,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.794,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.6264895000839313,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8819,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.44275750149644866,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.7832,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.5010512964374146,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.7201,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4091834743340737,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7087,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.5502180992156529,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 0.8258,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.5331677987078178,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.8464,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4818881770102341,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8135,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.4916236843884564,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.785,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.48518289942846865,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.8556,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4791473934410761,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.7906,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.49151403025248214,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.8514,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.4832521014843663,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.7771,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.5078382653849669,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8755,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.5340060334983346,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.8358,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.4805941528332125,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.8763,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5376214878006288,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8399,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.41915303830518985,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.7516,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.46136233136624094,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.8391,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4763426334860097,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8312,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.5026388350654547,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.8018,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.6182458559640106,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 0.9336,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.527715068970576,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8682,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.4447773028873336,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.7474,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.463959281978166,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.8171,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.5373781665591376,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.813,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.4835772506454369,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.8434,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.4429695324944789,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.8312,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4128380918026377,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7632,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.5210114678778003,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.8029,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.44628619243476736,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.8182,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5332101118781163,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8561,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.5342966562026665,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 0.8538,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.44840331988829313,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.7975,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.7289144373966412,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8564,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.4942579196886622,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.8017,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.5188942354342044,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.7921,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.5287360157098405,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7583,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.5252132506997987,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.9195,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.47226849074210947,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.8064,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.46085123217104457,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8815,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.6590894707714906,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.8787,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.5029288229631176,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.7836,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5211207894918858,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7657,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.5465988338616571,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.8561,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.5809027775381883,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.7855,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.462816765401537,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8331,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.5600373856307657,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.9407,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.6067728214054904,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.8819,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.49008991813846703,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7995,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.5541028209169759,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.8401,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.464542513012385,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.7591,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5154056362998911,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7447,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.5717212031111361,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.8583,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.5254911177742544,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.7924,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4974739978149858,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8095,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.5769448315890482,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.9072,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.5638757489101307,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.8934,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5079642645844904,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.796,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.4475648715258986,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.7826,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.5782655589773584,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.9027,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.46136766235987614,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7798,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.4989979612996333,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.7765,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.430880249853866,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.793,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.622610818905008,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.9388,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.46427227544234384,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.8083,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.48434244209571903,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.7798,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.5185043526212169,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8302,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.5218948447299503,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.7977,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.4929951302850292,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.7564,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4221293177296331,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7325,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.4835300941894702,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.7834,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.5061016404838357,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.7837,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.42083367657463067,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7589,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.49457450556033916,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.7924,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.5702855770493767,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.8282,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5603048847161772,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8849,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.5323390999392662,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 0.8207,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.5550731344239631,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.7816,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5467278587957556,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.795,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.5871770847988043,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.913,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.42477602952763593,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.8453,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4775436168861191,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.8257,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.5809533542417403,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.8422,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.5223553052307842,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.9074,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.6509701615774982,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8967,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.5107337996414248,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 0.8419,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.48571779122474323,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8223,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.502817573421187,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.7862,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.47996959799300476,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.8256,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.5004879222180794,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.8276,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.44055728969726077,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.767,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.48817412888697204,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.8126,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.5026490948743149,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.7725,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.42242925801537123,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7868,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.502595850937941,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.8072,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.4536894309981548,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.8315,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.49222831809950096,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8421,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.6071361839588753,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.797,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.4556865828533464,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.7521,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5357536923095771,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.765,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.45300643748452357,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.7924,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.4150781235551886,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.7856,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.5396250966839642,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.834,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.5777663052587192,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.8661,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.41978092651639354,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.8292,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4327189342629885,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7917,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.5339549918217065,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.843,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.6567496789400608,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.9519,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.48212135366814757,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8556,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.4513003393009851,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.816,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.5945292664973479,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.8212,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5674758834098058,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.9254,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.5049248316067212,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.8058,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.4310675636099357,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.7619,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5688076725316934,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.894,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.4825263353941901,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.8716,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.5204282839861062,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.7731,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.48193418900741986,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7857,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.5923084939770342,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8693,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.4346421240251085,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.8206,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.41518975595943,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7725,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.5412131324371964,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.7835,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.413041307949752,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.7578,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4350353314909387,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.745,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.5156924530388096,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.8341,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.5216251483099174,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.8143,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.4847475901770463,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8239,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.5130061024698517,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.8101,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.5627559370642568,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.905,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5760016774779814,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.878,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.697745543680888,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.8782,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.45254804088248957,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.789,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5177321917758226,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8278,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.6542211526689103,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.8947,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.46088993389214256,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.8063,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.44168806734271926,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7994,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.4893573959878582,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.7875,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.4402039706479555,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.7653,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.5503484096259508,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8052,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.445732614986419,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.7911,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.4683586447728192,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.8049,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.43590689675821337,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7289,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.4313722639335353,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.7321,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.4977979563104626,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.7528,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.5386795150252797,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.8602,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.4647156785894839,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.7707,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.43784545150260307,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.7433,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5418414036525606,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8628,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.49907138618976254,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.8055,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.4478793098798483,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.7474,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.7411814406167593,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8965,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.4671607046294163,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.7168,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.5183060058121355,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.7327,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.45676797841041233,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8069,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.5003422880277102,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.8096,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.4720046110864816,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.7936,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4659375231388962,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8447,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.3991832637400293,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.7613,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.46609744715715673,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.8139,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5473569722098224,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8917,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.5125319162989075,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.8458,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.44520219411089196,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.7899,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4939429098502889,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8231,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.5186461089228898,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 0.819,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.43812957642139744,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7547,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4768485119368363,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7975,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.5438896641929261,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.876,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.5040362007931832,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.8342,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.5907767181497797,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8547,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.4134578829391838,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.7977,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.49573855042537013,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.7871,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5376762627254102,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7686,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.4271734633171038,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.7099,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.48891927309885547,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 0.8501,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.46702103354673885,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7685,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.6714440742270075,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.8112,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.46719158465041644,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.7317,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4731175780188826,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7222,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.4813435196214086,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 0.8823,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.5365664793147633,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.7758,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.5151121319660465,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8531,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.4507442499257661,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.7806,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.49363095971705245,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 0.7851,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4239664546814916,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7667,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.5449213157291893,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.8113,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.5308694195693685,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.7687,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.5226657003524614,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7762,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.5066598445384889,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.8752,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.4451461004102294,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.7267,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.8399821417921948,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.9547,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.4897670468294,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.8311,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.48818058616095594,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.7179,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4419529212954025,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8136,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.4023698844938176,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.7455,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.47105044808585567,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.8503,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5408142824082688,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7178,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.5581117645528779,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.8308,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.493046943947969,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.8241,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.48141813650755766,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7823,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.5783959120067039,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.8724,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.5329112197466159,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.8125,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4479424873064313,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7899,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.4376026788812274,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.7518,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.5184722445808722,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.7547,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4561139568161695,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7951,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.4938852643093283,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.7797,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.4819159951909378,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.7643,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5486220244815267,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.788,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.5045180731593369,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 0.7667,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.5181599899043072,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.8043,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4923806565929291,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8508,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.419136356006891,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.7943,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.4455194340299678,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.7328,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4945754793624841,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8475,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.4420538266510707,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.8184,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.4269060032710139,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.7401,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.5099723371851572,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.845,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.4434519471106178,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.7504,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.43784132154621286,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.7089,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.43873857004609185,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.7002,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.4506345032265064,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.7988,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.4544785800442285,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.7629,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5072986473772981,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7645,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.48380278700417223,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.7708,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.5401366720008375,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.8885,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.40824039395235506,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7431,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.4773952792733267,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7904,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.5147375115771243,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.7826,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.4967881434720466,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7921,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.4020471288283699,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.7395,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.5547001347028977,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.8265,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5937416350965022,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.6799,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.47119996583636425,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.7639,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.47514075140957784,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.756,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.5430729858471183,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8153,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.4392242790602314,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.7723,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.43309400523514135,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.7881,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4428285074966686,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8107,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.39498572960290707,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.7057,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.45384895759122684,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.757,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4645589158513839,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7917,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.4828225303507632,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.7706,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.41676161155491853,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.8008,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.40768148396249776,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7125,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.4206330022531057,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.708,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.45852669946674146,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.7252,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.5187458223572374,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8094,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.4218113268112975,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.7457,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.4779548315246646,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.7733,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5305887385974879,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.794,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.4436089689306347,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.7014,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.505347922907718,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.8379,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5307748724766961,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7534,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.4603281559104605,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.6725,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.4448136344506952,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.7645,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4500479016642222,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7326,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.44498235610391224,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.7449,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.4481725927338574,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.7592,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.5038906250643743,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7622,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.4891318264847687,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.7279,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.46863427713366296,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.8108,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5081538565443008,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.851,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.44156652490572995,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.7617,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.4222184947608319,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.738,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.487228890741418,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8004,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.5018825649734135,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.8676,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.6506669771279707,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.862,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.6038723576749023,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.9103,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.4424814885937773,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.7687,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.46165403935919586,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.8191,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.5007573545248332,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.845,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.5423783522464709,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.7283,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.46307366479277484,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.8054,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.392444456670787,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7268,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.5157382155851634,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.8073,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.6019794040542387,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.7911,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.5257744626167561,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8577,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.405993749808719,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.7631,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.476821211198031,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.8177,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3908364341021406,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.6799,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.46398208229538557,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.7424,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.4222843288943092,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.7429,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.46262208566459617,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7788,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.4464732188169139,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.7253,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.45794068390259507,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.7007,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4100964140410785,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7254,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.5915915519935746,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.916,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.4283652869423221,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.698,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.4997935155568232,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8432,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.5255887998793641,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.7727,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.4563515619886789,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.8111,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.44738489833947603,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.781,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.434618116840989,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.7733,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.47343657771952574,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.7037,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4226790047438053,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7037,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.48576369552246895,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.7652,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.3921296847107615,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.7187,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.45799870157437206,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7962,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.46374413527372166,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.8047,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.6108939178009056,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.7613,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.5203855562061216,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8278,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.4793065235133658,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.8114,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.49522060969880577,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.7992,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5669903074697817,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7945,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.5845667221017729,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.8268,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.4377080202856934,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.7976,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.43160031783513936,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7563,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.4413100309779214,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.7697,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.44401477827582986,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.7469,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.49848698437485534,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7272,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.47428272112916825,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.7728,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.4284982233684095,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.7661,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.5216743563351581,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7863,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.44987741751980753,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.6892,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.5144487920497057,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.7945,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.45681089238630984,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7303,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.40916256219685515,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.6955,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.44578108377027037,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.7529,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.3928321523460676,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.668,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.44742959965839746,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.7845,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.43637108165151556,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.7247,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4702710993141143,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7579,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.43545505247112853,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.7147,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.5139992211382781,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.782,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.5152609439061868,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8259,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.45745088126524003,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.6755,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.4291324050724361,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.7133,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5518909640141185,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7545,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.5559425812819749,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.7434,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.49879588892898163,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.8135,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.43629817554003497,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.795,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.5605025168067477,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.7445,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.4156512765357945,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.7397,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4186968790638674,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7156,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.4383726287017184,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.8117,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.4508631691000904,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.7327,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4561001826749441,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.781,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.3923978069667923,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.6963,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.49431958857673547,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.8463,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5327542327158376,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7743,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.40841816323592084,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.7442,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.41469439312498524,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.7546,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.5187820639244829,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.8284,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.4814434766699283,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.7562,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.5158481600615762,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.8588,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.42692458496058494,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7278,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.47341159773000596,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.7832,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.4658179251976753,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.7859,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.426747121759328,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7149,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.4424751358594917,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.713,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.534590992906093,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.7878,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.41020810584542494,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.6778,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.5089599673121381,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.7577,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.44368525837594225,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.8019,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.39948427279118004,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7178,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.6373508403094503,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.802,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.45969661521525756,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.8592,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.41461759877326615,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.8089,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.461235913306641,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.7495,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.4790846421787979,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.8091,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4246111144159431,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7368,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.45585421366298007,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.6921,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.5284232309591278,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.8409,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5084751662011151,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7088,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.4935359417276924,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.7668,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.7144664711570883,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.8616,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.38572961600312905,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7047,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.4413409728911689,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.6788,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.5071092067644272,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.8143,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4537530113046635,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7413,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.4824048413518666,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.7409,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.4999336666430477,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.8097,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.47620574127289,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8534,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.4437154788572927,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.6857,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.490726925790608,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.7347,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5480906633151426,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8543,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.46751688012032416,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.7774,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.5012425805809494,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.767,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 1.0992435072970104,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8206,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.41011198381290903,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.7132,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.4851822022697597,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.8222,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.42959752142849994,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7622,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.4016411050805096,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.7147,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.3769853438689814,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.716,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4565265477451086,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.733,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.47916721260073963,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.7493,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.4264583107936126,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7257,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4297826533229006,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7708,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.45047366153254836,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 0.8296,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.4245796722615162,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.7569,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.45489281736063836,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7745,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.5182743917225674,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.7437,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.40290214610290087,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.7179,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5148862084632833,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7906,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.605194359947896,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.7882,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.4361179650333874,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.7683,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5566658095073518,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8477,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.4514734220833295,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.8213,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.4421968839147472,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.7037,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.42301251886582336,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.747,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.3805910704634265,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.7443,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.4933055681343728,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.8022,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3707160728618861,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7278,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.49277663481926737,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.8518,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.46835116809800187,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.7389,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4746251387265908,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7445,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.5137219752720172,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.7075,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.43526178449759145,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.7448,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.5320286884413845,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8117,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.36948366328283744,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.6865,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.8508520461548158,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.7845,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.5067628459496808,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8046,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.5182405956683679,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.8451,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.466922838762588,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.7409,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.49081381268238045,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.6975,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.4706872165534357,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.8513,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.5646184167803093,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 0.8282,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4222250521775234,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7423,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.49523475485097457,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.8919,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.5167773312757417,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.8549,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.458375374443549,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7527,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.5150099768348072,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.8448,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.45541857275671144,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.7478,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.46213514087870106,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.734,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.49544205271878083,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.7522,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.496244994144503,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.7419,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4508646273754306,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.736,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.40179344760412944,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.645,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.5612524453527867,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.8474,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4310080732108951,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7457,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.46411264235863475,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.7787,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.4089350662590114,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.7444,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4803026477066216,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.799,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.4877188647636562,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.774,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.6431853537583769,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.7877,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.37497102337352045,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7186,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.6827334487348299,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.8863,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.40588936339350645,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.68,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4215497216503177,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7247,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.7831930794444195,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.8478,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.5061159945992103,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.8517,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4890160436690848,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.799,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.5272230919346497,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.722,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.44480060503257196,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.7527,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.40239636808342866,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7641,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.40126641105842026,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.7094,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.3811150028579922,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.6642,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5752993294933703,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8616,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.39481132125794266,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.731,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.3999058489020813,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.6489,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.4703230878149846,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7545,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.49371013141131853,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.7686,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.4765619157691119,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.739,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.441853537963337,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7868,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.45815936906330557,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.7086,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.4272721732662985,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.7447,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.4785296959558227,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.765,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.5004299400117509,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.8165,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.47233982615106174,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.705,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4665555158010926,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7443,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.460903940185544,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.7805,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.46508206683156555,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.715,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4468702112243043,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.6781,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.42634830048222383,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.7117,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.47911788834227576,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.7687,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.43085295474765856,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.6401,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.564293491424751,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.8547,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.4291888973881084,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.7025,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.44075763650614463,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8156,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.48487222787768797,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.7687,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.4697238382791699,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.6664,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.5700028463426027,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7585,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.4105318090148504,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.6799,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.4119685152772743,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.7439,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4522329317929429,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7265,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.4765751028037275,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.8052,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.4258110640284748,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.7484,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4461786810301393,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7671,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.5429128790669628,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.8071,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.4390605947849205,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.697,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4038533604127142,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7075,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.5568686684952407,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.8495,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.4747017954068572,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.7613,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4366834280182313,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7626,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.44999454185217175,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.7343,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.5141098953987814,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.7381,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.4979981541763459,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8697,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.5435192399709737,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.7988,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.5865212864847045,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.8213,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4000217603254605,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7828,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.6411672155226577,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.8783,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.45516267410646694,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.7786,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.41059050816186476,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7115,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.46687590472932344,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.7689,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.38822886603753526,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.7012,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.47065876283619956,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7555,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.5020199385720759,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.774,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.3921988640457385,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.7096,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.5201087441068271,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8493,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.5451923737626841,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.7868,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.47105095892126675,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.7852,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5609582405955111,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.6913,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.48611142261714435,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.7384,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.4406908732434517,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.6992,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.4589133116579723,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7593,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.42418996835669126,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.7366,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.5763811465298075,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.8606,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.424128657028291,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.687,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.43710125300169733,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.6815,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.46211941169802706,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.7514,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4909993806968154,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7277,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.4923983192411157,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7655,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.4741403142399323,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.7538,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.49141031927588963,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7945,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.44870531022352494,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.7406,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.45734728130848884,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.7482,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.49567172635911816,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7477,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.43475413300778126,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.713,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.46977840374942775,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.8047,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4463777590892542,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7758,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.45214582433111267,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.7142,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.49773011149752655,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.7274,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.4314692528617401,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7007,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.47206850117655724,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.8095,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.4991695597510264,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.7972,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3983939422798158,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7764,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.5050203062130337,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.859,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.4387603838845481,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.6947,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3964909034223212,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.668,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.43171786252638306,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.793,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.5187456883855971,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.7265,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.41460633189876106,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7311,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.5480534679771224,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.7459,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.4353692803898675,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.7122,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.4821633256669228,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.742,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.5037306860164165,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.7978,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.507867334028557,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.8045,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5110555865166123,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7501,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.4073410256028646,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.7028,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.45785855304947404,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.7284,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.4220091163815511,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.6307,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.4914744393206375,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.809,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.4109261497500046,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.6795,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.5026884808116417,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7994,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.4541051107471657,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.7953,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.6293001395704625,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.8012,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4976287602137279,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.6998,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.4955922406872679,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.7605,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.44482542538331415,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.7485,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4313395133652232,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7156,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.4311577164197288,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.7096,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.4311431692900766,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.7385,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4427931426113665,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.755,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.4465549549553681,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.6685,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.5075226293783536,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.7675,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5226164172149571,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8111,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.5139809186257368,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.767,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.44754949926385723,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.7254,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4438473672187995,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7482,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.42731239047649827,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.7416,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.46758032555994145,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.744,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.47426682965763883,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7751,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.44707733777243375,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.7932,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.4004552113265985,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.7307,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.5467220954498396,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7373,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.5220563295258382,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.8364,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.5634002615178116,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.7415,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.38745943754068546,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.6637,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.486359386615219,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.7836,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.5010873605466595,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.7917,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.501288289888726,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6915,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.42303647740091144,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.7427,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.5633511965577845,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.7679,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4532301301358009,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7322,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.5658234907452527,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.7685,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.5118907752126777,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.7205,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.4700921538809694,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.735,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.3948866192176909,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.6639,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.5029679317744075,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.8032,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5285095766455293,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.8266,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.4678610632791758,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.7573,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.438079569007165,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.7636,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4315164695282025,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.6787,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.402632017098526,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.738,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.5319956953561994,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.6921,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4671769037323997,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7307,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.4963743520933098,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.8179,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.4376609890215395,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.7613,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.42537507347010867,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7545,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.42779369666001926,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.7511,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.46320255199500754,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.7646,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.40544842990795427,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.6486,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.43613137315290307,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.7125,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.4516560485665258,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.7988,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.4622059127493755,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.8072,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.6062904742380004,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.7711,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.4457959629263612,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.7529,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.5036905895559952,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.742,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.43078435343308674,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.7135,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.4309120967334071,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.7066,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3910644078370677,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6915,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.49404409459857673,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.7782,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.3786436836018683,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.6636,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4820612392270634,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.727,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.49517669517339263,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.7085,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.3894098085822135,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.7626,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.4809366667011091,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7941,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.4334479277896229,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.7546,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.411286707423272,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.6783,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4091591056349995,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7366,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.4459957849244347,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.7454,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.4340937086824739,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.7597,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.436311728081951,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7509,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.4130299382442576,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.7613,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.5002475940688763,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.7146,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4327174731401735,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.6813,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.5696939459056176,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.895,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.4131339642407923,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.7575,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.39482645237754493,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.6859,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.46843622030627735,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.7232,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.45430122743579016,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.7073,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.45158387414078693,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7257,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.39216543786202235,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.6368,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.4848796664385961,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.6737,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.410014976951978,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6624,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.39256947308996265,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.6836,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.4565237959087724,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.6788,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5259409688862253,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7694,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.47852330100220636,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.7403,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.4887007053618095,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.7099,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.43058414711812387,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7203,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.4454818052149887,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.7833,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.5429261826001993,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.7327,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.39815626556736594,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7461,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.39246289798886425,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.6866,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.4741579535345787,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.6839,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.4489399709915272,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.6996,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.4115010330663161,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.718,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.4956119127490432,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.7818,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4278273974928803,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7528,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.42786759225456455,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.7039,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.4337138843974863,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.661,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4237467557227778,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7349,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.40920424786891146,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.691,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.3686416086717631,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.7122,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4643581788066938,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7569,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.4762060081185955,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.7806,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.5081499789746543,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.8157,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.5130805092024211,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7197,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.45755645408144535,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.7087,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.4031907778603954,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7051,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.5609100001570461,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.815,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.49628776979255335,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.7691,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.45510353095639755,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.7281,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.47158515365956716,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7398,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.364542506038144,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.6406,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.3910998335909253,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.6849,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5004213445389641,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.8172,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.5649113104129985,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.798,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.4772701337982866,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.7384,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.3911165137008045,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.6913,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.3920118938414042,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.7101,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.43284706717797405,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.7517,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4571677343916639,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7085,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.46129653075125937,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.7866,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.4040534320766542,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.6709,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.48713734333464104,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7979,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.4422612508515027,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.6914,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.497515567006007,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.6701,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3749627961878381,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.678,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.4053618178498056,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.7526,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.4869231955669743,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.7363,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.5113603633922611,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7795,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.43042627566994696,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.7381,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.42018128767763213,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.7372,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.45114974776976924,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7467,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.37166054472673127,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.7265,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.5455089847152249,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.8383,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.47410323882797717,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7555,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.3708553682655247,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.6607,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.39716143357931005,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.69,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4637436225554559,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7121,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.4646629132988441,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.748,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.5259203569640928,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.7625,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.505306648827023,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7674,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.4137847217470741,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.696,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.49464438417860157,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.7926,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4242428104136327,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.6871,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.4727755381179743,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.771,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.43294329661718095,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.7027,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4398267303341206,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.808,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.5014322494464222,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.7417,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.43561358626505575,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.7333,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.5735756367259995,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7636,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.4816319454871905,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.7922,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.4310653881420473,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.7094,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4846420316957972,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7582,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.4324312494858273,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.7419,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.5074981102326569,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.7312,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4905776990896686,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7956,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.49200158234741015,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.7989,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.5086000623801779,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.7491,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.5133411173886876,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7377,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.506453431859821,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.7692,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.35191826880004473,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.636,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4385447355157215,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7413,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.5454625889989327,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.8247,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.4749738899934725,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.7708,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.43017137981306225,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6772,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.45323444813708,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.7362,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.4185420011773539,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.7612,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.45638549873019607,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.722,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.3661491978523475,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.6836,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.44714719395811886,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.7836,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.47466575379253284,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7224,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.4676065851593823,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.8039,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.4318394753620631,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.7355,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.47774067896149053,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7645,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.4311480104137981,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.769,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.410371089255066,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.7305,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.41546251368632736,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6847,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.4405943194323538,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.7464,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.4330941730814299,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.7176,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.5736085095853767,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.8581,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.4472813130814026,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.7106,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.43569787512162517,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.6917,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.49454833062512754,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7953,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.5181978628846783,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.7135,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.39352616423357195,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.6566,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4610612173739749,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7231,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.46913616543609205,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.7332,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.46953178408541935,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.7212,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.6271304886126731,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.8707,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.46209248966127764,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.7695,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.47942304215077186,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.8565,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4101241980577303,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6485,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.3816718730312766,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.6256,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.4417193001861336,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.7207,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.5981440375482587,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7957,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.4647444456843219,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.7351,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.40296395195083007,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.7092,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.402563489305275,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.6711,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.5106001320237675,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.7049,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.4237461312350529,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.7332,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4558093727384499,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7274,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.5204878320075624,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.6482,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.4889693265863364,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.7185,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4540591387797124,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.746,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.46190022087141114,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.6699,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.46435930782938306,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.7373,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.385266465778187,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6748,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.43843292366862363,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.6912,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.5162492799710219,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.7186,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4043800478325061,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7167,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.5791722782202059,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.7936,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.4128245865326509,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7243,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.4768187998001638,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7161,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.3975473520741925,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.7131,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.4369425936837535,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.7068,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4348740984844573,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7476,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.3751438595107849,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.673,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.45681190956328344,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.6976,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4595733453115094,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6722,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.4473268550932148,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.7311,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.4142221153102251,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.7096,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.36759043619548204,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7007,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.4520956488770485,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.7029,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.46071778297359106,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.7405,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.42962898937413113,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7513,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.4562043519029368,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.7418,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.43526330928681223,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.7218,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.41079236934528096,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6617,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.4603211357292951,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.7396,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.4160488570454149,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.6784,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.49332789032248425,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6971,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.3959679849224477,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.6869,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.39101799382598734,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.6586,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3843654891183163,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.716,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.6167233516284031,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.744,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.4667836434695035,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.7053,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4307218486114171,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7375,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.41892455337672124,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.7204,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.4105597984237018,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.6541,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4218351688180284,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7335,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.45366576883717097,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.7092,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.4667696614038426,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.7406,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.5425681188399039,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7437,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.38535317471221614,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.6716,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.4192535492707317,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.7233,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.40820513367792977,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6468,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.47887645513809207,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.7356,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.48708392333779144,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.7668,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.44890150234734566,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.714,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.4033643738540016,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.7026,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.5460244851697473,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.6596,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4317809805820681,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.6896,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.47426607526559306,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.7394,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.4985689491133637,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.8086,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.4549172063415435,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7761,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.4266860875624114,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.734,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.507563664980997,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.6982,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.5282820119941769,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7491,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.40503093114604405,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.7451,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.488867826119295,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.641,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.40976055576829357,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.634,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.4552955303343864,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.7287,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.4503466266751716,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.7387,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3981204039595053,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7428,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.48300892687341584,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.7706,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.6114796805270047,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.7725,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.47829190947630557,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7122,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.47509235009328055,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.7474,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.4163568295788633,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.6821,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.47479401809599403,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7569,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.5258693091671406,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.7352,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.3963908329305212,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.7156,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.3992466986292596,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7247,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.5036668467046255,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.8379,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.5227935206101292,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.7455,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3897132094791309,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7122,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.4437971490014331,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.6656,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.4402770228498105,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.7002,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.3981184649726954,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7065,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.38734965456860326,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.671,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.45967307493813275,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.7007,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4415502638993889,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.8262,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.3852510268950323,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.6025,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.43729735653926355,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.6736,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.483931726724541,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7594,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.4711825376788208,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.7427,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.4158881135613916,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.6529,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4229997503304087,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6851,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.41529759982962977,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.6806,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.3823254705941637,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.6424,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.5036785948104029,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7438,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.4657051853486107,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.7515,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.3659442262962532,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.6756,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4763932272432071,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6691,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.422659681385368,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.7436,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.45119229899083296,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.7411,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3813081513393947,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6633,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.4049726948935483,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.6458,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.47243820951738996,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.7511,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4278506901664067,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6735,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.40469358253849136,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.6765,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.5868985156349849,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.763,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.48585845000143807,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7767,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.41423105216125516,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.7211,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.46837065093491187,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.7128,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.549481629853275,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.8253,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.4633116098923696,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.785,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.3943224231160869,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.7156,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.44379164362150214,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6587,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.3626841490388854,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.6571,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.4436091443937909,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.7152,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.43650784153712674,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7308,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.4203241445174268,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.7165,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.4930095807984508,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.7946,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3928223631421178,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.614,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.4388642643374217,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.7225,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.44617337236204435,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.6776,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4251188578699655,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6613,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.36939164932819946,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.6789,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.37730336076580073,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.6928,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.4311008595792593,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7407,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.40750017747843514,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.687,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.5241761573136788,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.756,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5111238515251669,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7308,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.39767583558950237,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.6527,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.4565231247409654,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.698,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.41515013887448426,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6635,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.4190816416159948,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.6625,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.4079093991482028,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.6897,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.431786861547457,
+      "learning_rate": 0.0001,
+      "loss": 0.727,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.4289266535551944,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.6796,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.37447856671063556,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.675,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.5552197028303565,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7469,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.4798313893553535,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.7336,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.43961855872777655,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.7894,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.43490323371471584,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6511,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.3619713521872154,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.6846,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.3763967755843891,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.6752,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6151090094687006,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.79,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.47605106200301756,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.7487,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.43782978426625524,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.7237,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6640548217093781,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.7742,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.43626631149333445,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.7686,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.4356100076927952,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.6895,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.49130526928805984,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6991,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.41749768320352587,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.6479,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.45570962951682303,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.7182,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4760911403014369,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7697,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.41078163296557063,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.6836,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.4381182811100299,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.6954,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4264639388270093,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7135,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.46443605284534845,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.7778,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.42285933591883057,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.7208,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.44579506138805214,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7628,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.39990901331092055,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.6565,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.4368271543482817,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.7688,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.37718811129992086,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6496,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.47708446838070123,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.7916,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.45465817813415427,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.7285,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.46190590584113933,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7116,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.44740890183548143,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.68,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.40837738818897246,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.66,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.4393471858313049,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6862,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.4110038871541141,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.7181,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.3920397012663661,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.6742,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4658582980896299,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6823,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.5644670189980434,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.7212,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.5101367429450542,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.7766,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5466866564805702,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7976,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.4940172741215113,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.6682,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.4345453586038417,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.7241,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.478324668709217,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7239,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.45033807864970016,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.6897,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.5348174689129647,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.6957,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.3823896966212316,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.715,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.46073164939223454,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.6939,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.4801510110366479,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.7548,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4728619897015319,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7014,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.4218190375002509,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.6732,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.5422609846004213,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.7532,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.39433382681198076,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6575,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.42359330314860427,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.6779,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.44146456227553116,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.6854,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.47167580768786777,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.653,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.45133233237427317,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.7409,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.4201773986645445,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.7056,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.4023223296247732,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7154,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.4692166344755956,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.6946,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.44277364227217714,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.7222,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.480199163868813,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7443,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.5001288292413456,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.7551,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.5044486579782337,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.8154,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.5511924644753977,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7777,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.406075888614496,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.691,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.4101635069428767,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.6752,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4353275594534333,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.705,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.5381104154274325,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.7253,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.3675840727089868,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.7126,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.6471266189518096,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7445,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.49705826944673037,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.7579,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.3810241330113108,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.6612,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.41067557306498564,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7437,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.3699384289898883,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.6543,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.43438727622729145,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.6551,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.41481861561299377,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7064,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.415666985443102,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.6883,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.6092680663548886,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.7205,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.5012608718252068,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6799,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.5032541903633672,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.7814,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.41741920545163397,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.7096,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.46294385164549157,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7573,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.4904451208041044,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.7633,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.5795592645359291,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.7309,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3815941219656394,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6547,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.41043655870937906,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.7217,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.48331976809373395,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.7879,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.40530779548903756,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6537,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.4275792943736877,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.698,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.5413070152254348,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.7096,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.41309864800045915,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7467,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.6646512903066634,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.6962,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.3871055361932072,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.6916,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.4483959435891016,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6676,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.453937128181304,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.7195,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.38165176510745086,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.6322,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4581503378797954,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6548,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.5391863734865159,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.7539,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.3754003717719721,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.6789,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.4972463056027732,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6535,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.42390724311570416,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.6328,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.675660727677251,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.652,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.6435985943801711,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7919,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.5006136042376417,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.723,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.39625902620982,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.6576,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.4545922958527993,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7205,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.4495068479338101,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.7018,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.4709380463003231,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.6701,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4628075359403977,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6762,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.5041511456595938,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.6889,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.4840293659189363,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.7337,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.48667160173127905,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6733,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.4617452426260254,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.6993,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.45671699472528243,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.7137,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3916027221909562,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6407,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.45910141535244414,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.7296,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.4819780361829293,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6869,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.44847446180169626,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.647,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.43917620108531935,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.7588,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.5162715358080738,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.8084,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.472443797093647,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6519,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.4983222958473055,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.7586,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.3696287349960062,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.7146,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.38460780298635355,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6523,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.40394723209746247,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.7032,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.40004900093478846,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.6215,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.40870489691697814,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6469,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.4377794759908524,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.6999,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.41096064783451314,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.6945,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.5247766871411668,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.7336,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.3913091904296901,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.6639,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.4321583957999197,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.6763,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4265285273317709,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7191,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.42939990938106737,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.6835,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.4380068744737683,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.6991,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.4562897063384828,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6915,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.44342254603261516,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.6327,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.400996985089008,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.6373,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4241732013273108,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6915,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.4533395746973437,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.6822,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.4123774386387129,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.6568,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3893319046364079,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7115,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.4211570834760039,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.6263,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.3956464152384994,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.6815,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4130677399041131,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6834,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.43691215060528366,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.7108,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.4604241063294278,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.722,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.3826368858897757,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6454,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.4449266192184151,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.6153,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.4495658094813891,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.6583,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.44963815352240066,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.727,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.43131840431294677,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.688,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.3733896381579521,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.6977,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.5175259996500226,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6559,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.43135933793781067,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.6584,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.4409186145412831,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.7349,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4236489251890136,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6436,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.41546574374402656,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.6781,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.4234569524859422,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.71,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5272396751585382,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7429,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.580902171591594,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.7402,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.4777516255620962,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.7329,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5089033422117083,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6889,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.45600715463903174,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.6518,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.4294783096789488,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.7051,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4576375770301386,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.693,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.5066575878603606,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.7335,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.43331802255903185,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.7554,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.48686219681220616,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7369,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.446045834246583,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.6921,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.41500722997459405,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.6977,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.5460447655345045,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7253,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.4023608330688251,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.6058,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.41756688712429046,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.6962,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4063462226629591,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6942,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.4070642991216119,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.6769,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.4268586750200847,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.7499,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.387444132229861,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6531,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.49725188955591815,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.6627,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.4436536946783928,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.684,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4885179213576605,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7008,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.4381017756174437,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7327,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.44024311320657594,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.6876,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.47779912093946264,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6258,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.46478338177148526,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.6318,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.3871467694683644,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.6567,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.5101724117368083,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7032,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.40078174085852053,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.6403,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.4719771558727841,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.6991,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4660594729728226,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6738,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.5176869958498652,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.7341,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.4136478069853565,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.7043,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.6700799476808592,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7718,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.5370758797710055,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.7118,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.3962294485767609,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.6653,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.4125661428087789,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6779,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.47755432152361943,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.7293,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.4738519129094318,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.7,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.37907866317871847,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6715,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.39135369922972446,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.6745,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.4464862790280168,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.6871,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.350240959730002,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6546,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.41028407238467257,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.6545,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.40466767126694686,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.6674,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.447732905913209,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7267,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.5026122896326217,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.7362,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.39518768955958067,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.6672,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4753065981760208,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7225,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.3898190332927234,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.6345,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.39211498039162146,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.6628,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3737364169920672,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.656,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.5663306152299664,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.6456,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.3959433909405213,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.707,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.33966680414474154,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6116,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.5014488892121401,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.7801,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.5480557671766723,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.6983,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4305705098423576,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6718,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.5638322507392332,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.7683,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.45442711692530413,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.6978,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.43412941480796136,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7614,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.3814499211124445,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.656,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.4240228713220223,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.7085,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.43844576076097735,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7193,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.40423358488133027,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.7132,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.45430769254372433,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.7051,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.38735075879438313,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6902,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.4476681106012167,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.6721,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.42450882043204863,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.6489,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4759730254749979,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7535,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.4478631755841564,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.7192,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.40612761287808763,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.6223,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.466293785073214,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7579,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.3831884127623581,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6691,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.41725252769187654,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.7244,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4410592672061088,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7287,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.530556815181464,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.7611,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.48591244087808333,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.6385,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.4193839412184438,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.689,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.4214027254503024,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.7114,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.42500594343946496,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.673,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.40163503630139474,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6312,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.4116961444002072,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.6433,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.5148769388922457,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.7168,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.3636014962202317,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6258,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.4428307925836097,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.6607,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.5207998952605079,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.7153,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.3870667355173029,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6459,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.39873895800873504,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.6063,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.4269796326987652,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.659,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.40107403756608534,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6843,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.47558164818682663,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.7933,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.4164460670334844,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.6681,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4133344695022582,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6839,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.43040993213510487,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.704,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.49518477477559136,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.7085,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.4850571750526892,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6833,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.3987796390002741,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.6622,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.4755184923128712,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.719,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.45590352150965446,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7036,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.43236767622934885,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.702,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.4025775660821899,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.6568,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.4541628654123149,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6244,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.453429834217993,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.6979,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.43851463617889785,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.6896,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.6467991129526891,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.768,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.6128183852221792,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.685,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.5330439921196597,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.7423,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4176249851405337,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7399,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.4868808485395327,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.7761,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.44448990010738193,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.6793,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3831915951362028,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6671,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.48647800046584916,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.6366,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.5028292794485312,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.6782,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.3969565263432366,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6453,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.5597339694128612,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.7484,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.4407762969942455,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.7436,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4701653531005452,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.5762,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.4905199446737774,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.7413,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.4323538996619484,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.6507,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.41336776735073155,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6721,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.3865817908859722,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.6448,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.40584853488622363,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.6629,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.40284245486592835,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6846,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.5105925650064085,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.6974,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.48321533653328735,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.6283,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.43708617207234074,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6495,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.44014677243922035,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.6814,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.4711462895209837,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6951,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4161264510871055,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6857,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.4775308524055948,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.6804,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.5808846635377214,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.7775,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.44018239694044253,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7215,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.44483930698309265,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.6953,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.5334705549207137,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.6846,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4635334204863553,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.7009,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.48990461615390385,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.7134,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.4650288660872971,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.6391,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.5077059699948259,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.725,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.5310166258131124,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.7759,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.5158015698100694,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.6699,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.45919070240719934,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7224,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.40816446901171416,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.6087,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.418617164731529,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.7321,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.3971450135224963,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6935,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.41857208990235506,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.7104,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.43524793559071867,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.7494,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.47499544549078676,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6974,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.3653990388932472,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.6517,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.3826926723394425,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.6352,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.47546251831972147,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.8002,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.46996949758067974,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.701,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.4397199542107964,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.6772,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.40995388485555545,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6291,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.4292507699261795,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.7056,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.3959481857025995,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.6909,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.5011243027705662,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7071,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.5264871079596701,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.7536,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.4196101521504747,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.6729,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4701477446986873,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7349,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.45922516003781183,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.6913,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.5222404404840132,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.6479,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.41816096701873523,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.5806,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.487397058916506,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.6774,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.43079324609284403,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.6776,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.46102733535716617,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6919,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.3953396431957988,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.6575,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.39399303788299483,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.6213,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.3759089877093692,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.621,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.40925533431762995,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.6247,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.505159326387147,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.7192,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4221276559855802,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7134,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.3655578499295446,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.5917,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.434260553186921,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.6226,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.41927843946365817,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6162,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.43675092772193574,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.6711,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.4482351276308452,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.6319,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.45343576093395144,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7511,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.4028826179925407,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.6549,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.48046840157404724,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.6538,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.42045322713418654,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6759,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.44657651646268753,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.7518,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.4556318718130518,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.7657,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4397741660582045,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7605,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.4531289604393625,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.7293,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.46589715983901325,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.6772,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.5073510560869048,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6744,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.4157927043911074,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.6682,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.4770391448646444,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.6817,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4801890769977989,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6698,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.4734320911245157,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.7149,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.4420285003704658,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.6655,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.42934215063887937,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6605,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.5043512124137548,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.7483,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.48819416093841506,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.7453,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.42948686180896956,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6741,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.4634141322694243,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.6771,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.4040112591373145,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.6696,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.4972940744230627,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7481,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.4779896136914934,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.7624,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.35441590514537924,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.587,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.44120062830045637,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.671,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.3569951734313286,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.6416,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.4853865033135809,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.7498,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.5425103499740149,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6326,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.49556347361308867,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.6519,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.5412599360095961,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.6511,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4397286600871548,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7173,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.36734758083748076,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.6393,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.4783721529678446,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.7696,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.40301976864384675,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6797,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.35473872328811157,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.6079,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.4013938109382642,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.6165,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.44058459786803794,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7382,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.5328855124479709,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.651,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.4455698946397586,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.7481,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.43625608319365694,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6686,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.39478402973427,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.6772,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.4447714164559942,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.6152,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3709666292725546,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6522,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.49935376346546406,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.5718,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.5193097691899753,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.7117,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.49367808147553527,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6369,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.4446072810452819,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.6319,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.44230619891195616,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.7182,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.38420112215989166,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6216,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.5019080492723267,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.6856,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.40799296100479354,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.6419,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.36619573541564876,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.5831,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.4294973033120836,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.701,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.42997115843154643,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.6917,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5505262459058446,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6789,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.4484169997765625,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.6884,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.3253818373046138,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.5682,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.5005046168097154,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7798,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.4535851767705107,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.6535,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.37017306277182493,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.6378,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4659719483337839,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.686,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.39666192885243245,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.6586,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.3715412733899542,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.6385,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4029055369944867,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6223,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.3572924866367461,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.5971,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.4426536277758904,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.6824,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4441399528703781,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6384,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.43418683226601634,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.6839,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.4411283154623925,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.6961,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.4195467102302851,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7042,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.39319018327375055,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.6923,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.4232049942394652,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.6429,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4127115823043955,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6523,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.4071491423804467,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.7078,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.38089821859697426,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6483,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.4405816078153508,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6595,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.4887840782636558,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.6609,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.5416620265321336,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.7102,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.45612068099197606,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6926,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.41827704069272087,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.6795,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.4216592821850561,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.6102,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.5968250083973519,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7302,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.410665467136115,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.6755,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.4695062858668635,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.738,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.46271244818895624,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6636,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.4015570788728205,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.6179,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.48364235248570314,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.7225,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3973194957699503,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6019,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.40642190153774327,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.6479,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.42111818419243835,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.6101,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.479618840905392,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.7012,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.572763218453302,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.753,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.42415380076495746,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.6656,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.44506141267476257,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6995,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.42713836436030106,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.7182,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.3998359784462856,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.6857,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.5075873742854828,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.5661,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.43959844272118975,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.6635,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.45097273226825346,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.6798,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.4528735769012832,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6903,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.4493233856167751,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.7353,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 1.1432675217729333,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.6526,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.4719850237881296,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6482,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.4575611421586871,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.6597,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.3955243959116776,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.6432,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.4209895420383914,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6398,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.436750904523161,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.6819,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.44006405567494983,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.6409,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.41012660114209337,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6675,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.40466806228202684,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.6112,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.4320520288424673,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6214,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.6547957865987912,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7243,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.493665262749511,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.6408,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.46362516582819296,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.7521,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4838596305339148,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7279,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.40205261407352516,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.6594,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.6608564183232708,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.7107,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.40477314830392686,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6404,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.45862989831373807,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.6943,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.4499846395944625,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.6938,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5009883034739381,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7392,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.39787126435833514,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.6446,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.4962610327432991,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.6979,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4112045306396125,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6257,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.4206304928677344,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.7027,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.42846636415891637,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.6885,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3764584034790101,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6238,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.40290372899365723,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.6628,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.3824708549754406,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.6348,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4431141193201716,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6318,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.37440098757183116,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.638,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.5076540492769912,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.7331,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.43367732412243143,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.5986,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.39999747699379296,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.6826,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.4122860559375757,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.6402,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.5332219220904169,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6996,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.39457104219654593,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.6206,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.4125437184555351,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.6496,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4434116647478168,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6972,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.5242067106090085,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.6346,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.48697919458894945,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.6727,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.6840449583991132,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6906,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.5723928648456147,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.7639,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.4842751313789253,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.6301,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.37937441711304115,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.663,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.45872658764132634,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.7325,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.4032654765599899,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.6517,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.6299189974694519,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.7193,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.3459084928602634,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6186,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.40655230914119433,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.6147,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.47992854967493603,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.668,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.40885929670105964,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.6854,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.562744141891325,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.7349,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.579209398186038,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6996,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.40889853330697956,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.5663,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.4039180918173561,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.5978,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4260067864950819,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6329,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.35314156930204316,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.5872,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.40319276048692143,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.6405,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.43965680601175855,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6824,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.4064539636314437,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.6392,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.4483840815129231,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.7326,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4523115976497022,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6374,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.5091395654116365,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.6796,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.4599915732411999,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.6411,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.42810617569498916,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6739,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.42906586147037934,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.6877,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.38841250504046165,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.6279,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4342986841252191,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7165,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.38872010965643383,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.6231,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.5407543818226905,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.6601,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.444503105857603,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6142,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.379561000692178,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6243,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.4121710483336557,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.6316,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4691290071900537,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6443,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.4055354723342302,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.6107,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.3929569369202823,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.6733,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.4814978653308864,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6407,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.3981649138803261,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.6531,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.36836314340158854,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.6226,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4481966583544684,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7159,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.3500837632158312,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6112,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.4783183264793033,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.635,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4443236553599415,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6441,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.4541438465337239,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.6763,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.4718351012226869,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.6387,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4241830736068816,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6314,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.4680461026945068,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.6812,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.7163228104444151,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.6863,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.4554616306459936,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.615,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.4007787544244606,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.6999,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.5228251349902756,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.7226,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4230339780418657,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6562,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.4289200465615318,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.6975,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.469280677300091,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.6408,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.4367332567861892,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6777,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.3819927261154882,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.6572,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.545684475160914,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.7169,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3756034309671269,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6111,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.47738980941475234,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.6274,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.4614568106789699,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.7233,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4170144770443061,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6834,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.4378867660574267,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.6897,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.6109643981863047,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.6896,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4362002872919862,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6262,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.44474390683765214,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.5892,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.5702420791951768,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.7219,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4787500446260359,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6837,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.48508785097643087,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.67,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.42022236520136474,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.7159,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4527569141085781,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6423,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.39016311585877517,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.6467,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.5433799100774432,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.6036,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.38182838533340996,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.606,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.4369063274675231,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.6541,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.46879166057021016,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.6798,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3369820745212901,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.5896,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.5947328223831719,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.772,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.4633678727479024,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.6713,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.5685596062239611,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.73,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.4081445228850184,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.6344,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.44003364356713587,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.6511,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.5786424582544375,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.5641,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.43733236504905026,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.665,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.4502124006287469,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.6443,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.44235441651659346,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.65,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.5184934723274678,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.6528,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.4166153228231402,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.6568,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5684615427199803,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7484,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.3977201241748141,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.6442,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.48124047636989997,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.7476,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.6256985102564921,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7582,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.4135303417301385,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6429,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.4542126357851726,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.6585,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3936954529296244,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.7135,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.48552027393072117,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.6239,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.4455902312851424,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.5849,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4632386386190594,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7358,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.4750722058952672,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.6443,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.3797520738465224,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.6288,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4437672820295281,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6568,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.41281488934315297,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.6181,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.6409353771099737,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.6241,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.4639950500009388,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.7206,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.4341948855687017,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.6599,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.4144951273474816,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.6564,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.48211622062561166,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7183,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.5195647645247848,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.7231,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.567201243175725,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.6932,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.4730026109960757,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6919,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.45433871843404566,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.6607,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.4097757902258825,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.6481,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4445355264901289,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6355,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.44696609144607774,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.6668,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.411072395882937,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.6622,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.42983422126722076,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7323,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.4965015650726598,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.6629,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.43943615031041566,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.6918,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3798145018855565,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.64,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.4237693029700599,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.6481,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.4354980981388452,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.6503,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.49188118000721065,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6602,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.5150270739089561,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.7321,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.4432625754648843,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.686,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5666187564839488,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7402,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.3587049280419388,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.5906,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.3913212053081317,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.6042,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4018650792169943,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6515,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.3905729772016787,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.6916,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.45168546096280476,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.6439,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.39448987239315225,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6338,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.41538214396139544,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.6622,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.3518625690462678,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.5916,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5183792247040455,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6675,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.37766917561812363,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.6529,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.3643754756719974,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.5925,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.373712533344589,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6373,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.48997948446422157,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.6207,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.47513872374997856,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.6708,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.5118449162386708,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6799,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.3854080092446448,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.6293,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.43422935312312305,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.6373,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4173380998324428,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6201,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.38922889827805607,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.6496,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.4887497044658339,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.7261,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.34874728843648506,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.5998,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.412856453140932,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.6768,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.4742852328898488,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.6613,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4380092885638821,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6444,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.4284426210759624,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.6596,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.5193185731149842,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.7499,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.4049456112109701,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6365,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.4543968233765363,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.7466,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.48525162102566444,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.7101,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.49142803118979916,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7279,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.4121167023556667,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.612,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.4392176348033429,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.6512,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3763130191660776,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6198,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.41785549470405486,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.622,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.41218349773786217,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.6049,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.46669989087698827,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6865,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.4355246424305574,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.6605,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.389477566637251,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.6501,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.44038632705053193,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.7111,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.4806305889169313,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.6465,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.46298817064117004,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.6892,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5003699140691916,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7541,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.4394608676070686,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.6181,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.4829426268887746,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.7067,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.41210876324544893,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6117,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.42686456913939924,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.6556,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.3866631681828193,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.6473,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.42655242487824907,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6375,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.4389474578671961,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.6484,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.3820630345668428,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.6199,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4046582607113642,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6241,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.4311763839878247,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.6651,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.4542880511580375,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.6579,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4120580504746446,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6905,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.4884266870435224,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.6292,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.43728759161149866,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.7067,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.439111482727896,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6407,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.4845461222834999,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.7314,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.42911042479983624,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.6611,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3968703255307971,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6534,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.38961375599421977,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6134,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.46070695937734074,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.6964,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.48319645317624976,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6803,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.43922212202329997,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.6305,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.4385169927193048,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.6984,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4530229102352299,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6638,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.513428802989218,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.7451,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.40863775505880795,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.6235,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.5514454993610067,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.769,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.4075714742225624,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.602,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.4042193874742804,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.6839,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.38368033873891205,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6281,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.47887727965872756,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.6253,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.41406932588159323,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.5932,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.4321653709688338,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6539,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.4325485273086699,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.6799,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.49747056271681245,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.7104,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4124555922656816,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6821,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.4278702974851724,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.6409,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.5268838421986082,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.6701,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.3356705873094502,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6254,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.5115665554429503,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.7473,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.5549275423071007,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.634,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4129681953465492,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6858,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.4334896205553804,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.7082,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.37738828750884534,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.6513,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.7062015918845946,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.69,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.3933121471822579,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.6891,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.46153451371390386,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.5924,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4619194720743569,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6446,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.40485432193236764,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.6527,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.46498987579859147,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.6156,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.4273623814044134,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6114,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.45165622762854996,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.6164,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.5411455976859217,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.7047,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4170236243872799,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.5988,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.419468252827379,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.5675,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.42088947831228424,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.6784,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.42553680475341854,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6179,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.4851315382837286,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.6756,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.39282675919244703,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.6327,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.5588568816765518,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6282,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.4698368238260259,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.6312,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.4367667295862253,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.6252,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.4577544590908375,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.7267,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.3917080039263942,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.5493,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.4628547087686605,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.645,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4100854716226644,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6528,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.469383484226112,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.7227,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.42939021894209495,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.6351,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.4587514440256765,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6145,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.40796818894275366,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.6208,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.5245166810352607,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.7941,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.41858576012266857,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6605,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.5029441424031276,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.654,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.39588977096060385,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.5754,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.43398608387611426,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6965,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.3956062769487065,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.6066,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.5799303774810434,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.7393,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.40473359878926773,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6818,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.4557557799799093,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.7586,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.48245542943285297,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.6346,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.5110922381694472,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6719,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.3613012154317809,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.6581,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.41451177222561064,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.7078,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.43499608088847636,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6433,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.4059383660825497,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.6467,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.5582984266495177,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.7557,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4954697061395202,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.7238,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.43864156821664785,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.643,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.49264354917470277,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.6634,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.6162265954390206,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.7582,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.4710076943669117,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.6638,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.40355569907650257,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.6128,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3829409072081762,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6676,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.46670947512097,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.6814,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.4542642570304578,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.6082,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.38769564929516004,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6271,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.5254532367814335,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.7208,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.3977113454905695,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.6636,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4424789105180282,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7182,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.440539538012321,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.6679,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.4212358326675856,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.659,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.43795798542433434,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.5912,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.42344934101655535,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6783,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.3830263313919214,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.5908,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.39445880826051793,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.5571,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.5477526778829184,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.6966,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.385803456198678,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.5765,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.430827174675466,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.658,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.47765928045648276,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.6866,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.42680757206130226,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.6118,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.5240190067743186,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.7096,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.4836966132884837,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.6967,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.5477118813907119,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.7548,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.4053454839277045,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6784,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.44014920474258146,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6489,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.39387381096905677,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.6563,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.43899280415011493,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7034,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.5309862604436678,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.7453,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.4032446104401672,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.6091,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.402482542954159,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6215,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.3656568731027382,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.6173,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.3649188766171377,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.6313,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.404074350484019,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6398,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.7792532589315108,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.6445,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.4072870206549389,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.6979,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5802046062782825,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7653,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.6134815136581775,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.6964,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.4566500800470463,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.6755,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3889385811942882,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6367,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.42132189571813033,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.6709,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.5502658206991807,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.6844,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4109979916703212,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6656,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.5349626472192117,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.6792,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.3929259820512547,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.5987,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4238283605425105,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6342,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.3474079856752595,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.5476,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.4366692181154197,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.6367,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.44436214778993216,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6152,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.3348354336295145,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.5874,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.38847198321126125,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.6504,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.40424377782673143,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6582,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.41002965922987233,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.6866,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.4067090297962575,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.6706,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.417347825399175,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6539,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.4726019554872595,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.6016,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.41764107695884006,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.6043,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.36693776341659945,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6144,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.39934235720381156,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.6735,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.4965655480591488,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.7142,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.42875820347453275,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6845,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.4055547482098708,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.5752,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.49959247579552135,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.6544,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.43426902373524473,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6196,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.4382330506712958,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.5806,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.45469155884809576,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.6073,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.42681364610491224,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6396,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.4478423617494056,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.6639,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.40181955384189383,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.6568,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.4081222757236124,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6855,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.4236295476459045,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.6624,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.4412142904850606,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.7157,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4643292919097104,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6453,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.47109321823024464,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.643,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.4169201898978646,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.6233,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.4420269742790322,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.621,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.5310896307326314,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.6592,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.57693411108257,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.7531,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.40027668439784575,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.63,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.45554959525323824,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6944,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.5800681695499549,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.6839,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.45277302383436674,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6468,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.45271890416023614,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.671,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.42363472987303374,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.6872,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.44406485937332685,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6385,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.5587188392936556,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.6416,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.34896010757851925,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.5476,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.37165423361942146,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.5964,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.5551780434620531,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.8058,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.4459932791245074,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6536,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4417695511572837,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6405,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.45979733840097287,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.6757,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.418860251545236,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.7194,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.43516647592718966,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6277,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.4298094963049132,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6526,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.4003875180697773,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.6357,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.40805824598866824,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6297,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.37244563492274635,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.5876,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.4177365233111911,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.6027,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.4353412020431162,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.69,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.4120364477361252,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.6957,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.4239066267875004,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.6385,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5342632200761349,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7627,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.36817388197940243,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.5786,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.48533062698194884,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.6796,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.37074100093183415,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6123,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.4141493977118879,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.7048,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.415217243574443,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.6546,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3910056185139864,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6141,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.5974574992263897,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.6627,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.5139361713417283,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.6813,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 1.1734032114524424,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6291,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.47747074418518726,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.7475,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.4311561298803717,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.6713,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.504562022783302,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6561,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.4773157125725159,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.6655,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.38604019845998105,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.6234,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.3945973662477523,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6264,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.43171237307055293,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.6261,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.3528323500298493,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.5892,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4019638365618088,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6306,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.4045721317863742,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.6476,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.3957708441520497,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.6462,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.47367736399522625,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.65,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.5023741182081118,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.6989,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.4551564703113851,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.5782,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.42327891958627145,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.7395,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.48474062249707706,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.6498,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.5051658504621682,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.6711,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.3645228532726314,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6617,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.3975382127868151,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.6635,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.4276411180800346,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.6609,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.43384637336354825,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.637,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.4387910782380915,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.6301,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.4999299860408389,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.6644,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3880675231249164,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.5845,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.44004972785404506,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.6673,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.4460458904659256,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.6807,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.39196638469319506,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.5982,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.4268942353014067,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.5791,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.4165890060081552,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.6684,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4198533615173099,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6663,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.428303158648751,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.6244,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.44118042394269336,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.6533,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.462106473691002,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6862,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.45822369136078545,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.6385,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.4416151139936162,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.5906,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.45141304264926185,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6291,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.4248290538485947,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.6195,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.4654856422485027,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.7059,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3657999180986568,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5697,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.45784132439167596,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.6836,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.5252729583036887,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.6651,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.4437070604664506,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6383,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.48425128455025135,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.6486,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.44305311958641147,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.702,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4719295320937795,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.5969,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.45356385639317337,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.667,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.4223167880531397,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.6455,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.443812423913915,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6557,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.4098485018285767,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.6313,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.4181108336526171,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.6448,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4616096472968845,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6449,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.43449513413519597,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.6886,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.4735156314757002,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.663,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.3816892624469625,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6468,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.4392699187162705,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.698,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.4374484167172048,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.6153,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4643901351796245,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.5761,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.44992439538756956,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.7871,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.4749681289221362,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.6527,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.37467779687969793,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6104,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.4132835221926209,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.6794,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.5321217168579051,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.6663,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.41446334553886355,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.5672,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.46896441580567816,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.6666,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.5147065885708658,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.6069,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.44285490265539945,
+      "learning_rate": 0.0,
+      "loss": 0.6387,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1615624508571648.0,
+      "train_loss": 0.7278803406397502,
+      "train_runtime": 28964.1933,
+      "train_samples_per_second": 1.036,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1615624508571648.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e899733f07bd8e65f6027369fa419218820e94d
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3567871915d25449d57d4fb0d14d5ebacb292356
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63e78c71e82a882872cc4892cd944059d50f1d4b2184eb25c2484af31f6223d2
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbfcbdb872aa490da15788530450efd5b4eecc48
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c06f078c202ca187faae7464d772cd8ccc710939433a26fed21074276a4b4e38
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3df8d424a3af07f79fc45f0abb5b44fafe033db6
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9363979537893283,
+      "learning_rate": 2e-05,
+      "loss": 1.326,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8117640234578937,
+      "learning_rate": 4e-05,
+      "loss": 1.2227,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7794277562491366,
+      "learning_rate": 6e-05,
+      "loss": 1.2748,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7435848597042757,
+      "learning_rate": 8e-05,
+      "loss": 1.2728,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.6152566686715281,
+      "learning_rate": 0.0001,
+      "loss": 0.9413,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.9534557360712878,
+      "learning_rate": 0.00012,
+      "loss": 1.1385,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8954079865789398,
+      "learning_rate": 0.00014,
+      "loss": 0.9773,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.9440691389909568,
+      "learning_rate": 0.00016,
+      "loss": 1.1365,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6351021865944403,
+      "learning_rate": 0.00018,
+      "loss": 0.9804,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.6133804482420512,
+      "learning_rate": 0.0002,
+      "loss": 0.8835,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5663860089193683,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.8981,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.49963909488217073,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.8886,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5599956879255387,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9021,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6737809693905482,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9343,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6902953980011141,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9473,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.603551811924562,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9259,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5540107660433663,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.846,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5330555970663188,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8927,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5804294290736568,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 1.0149,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4956640641721262,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8418,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4996576021612778,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.915,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.49443061973090485,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9021,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.5409978992320507,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.9499,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.45638969485150216,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8657,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5028902860236372,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8335,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.49120503529481024,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.7453,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5840564683830789,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.9129,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.503872495625192,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8842,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5430006396028205,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8861,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5944535180864793,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9127,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.477912643699261,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8509,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.6145012676574753,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9374,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5612215454429993,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8671,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5296079193299826,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8027,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5766512796370208,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.9182,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.6273138421530724,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9288,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5084629286636579,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.7973,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4488607448801517,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8201,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5448883697942045,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.9148,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4557519242439591,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8354,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4880677089404733,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.745,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5357339810746296,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.866,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5016337897862774,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.7784,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5689677962703366,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8541,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5538540615030474,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.899,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.8508959404814713,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8346,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5666950364950541,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.9008,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.49530552873035577,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8171,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.44079239666446396,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8425,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5430257415754854,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8827,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5122734653026103,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8715,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.47037923716378616,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8428,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4071915604324042,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.7873,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4930792135935222,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8473,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.46072983553161256,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7934,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4724090002512796,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7856,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5424387356431846,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.9459,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.49540806475580534,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7716,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5035931795876943,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7834,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.8131080597684293,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7759,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5163423632671406,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.827,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4291452218834803,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8182,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.45224199709813434,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.813,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4261090870298,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7053,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5167876729083449,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8275,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4313552859242522,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8171,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.45289041539428,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8492,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.5166261174988512,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8338,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4157435641812972,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.7917,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.491402132048252,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7981,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4445528461657566,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.9052,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4197447499086703,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.7493,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4886253206372016,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8029,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4765189446925175,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8344,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5144358719291028,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8304,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.46916722646695236,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8396,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4697171238614981,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.8141,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4567021849997054,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7285,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4894482041948432,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.8294,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4904653881718381,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7718,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4234566601688781,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.768,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.46261981904263455,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.7512,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5106893649433253,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.8489,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.6163005786610454,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8145,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4674196739539739,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.7815,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.48965758838373274,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8273,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.419009348485611,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.7198,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5630703058755215,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8807,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.5974320059787537,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.9517,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5106332295584806,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8321,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.5241845014581666,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.8576,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3817337728587,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.773,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.40242980611290113,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7608,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.49824254385091293,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8787,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5597415955082202,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.8821,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.41719321050637004,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7712,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.561733655171769,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.9214,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.403603783227237,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8177,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.401241015246352,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.8014,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6093542375620725,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7716,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4174338432478432,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.8355,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.560369224533605,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8222,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.48807236825344513,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.6906,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.49877772882012494,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8299,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5627159269452524,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.8169,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.5239948943616941,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7465,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.41041236366072353,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7824,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4348437359785736,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7417,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.6220227374639119,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8912,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4296457112545859,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8072,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4360646428952031,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.754,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5054723789375187,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7758,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.494290935202186,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.8382,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.5333671479523171,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8051,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4993376861957015,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8322,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4982647162083319,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.752,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4423169139813519,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7763,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5349362617362463,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.889,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4435089793619459,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.7732,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.43091617490721384,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8183,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.444060503169897,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.7883,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.41965491835732754,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.8106,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.44284326089286935,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.8015,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.39061979734847707,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7657,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.39703513811657537,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7763,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5043580429456066,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.8012,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.5629925641967249,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8638,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.5460601526584461,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.8266,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.41985598118037926,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7357,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5475399510700736,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.8838,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 1.1006791888104086,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7982,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.38815953401271486,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7272,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4440664673903164,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.8308,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3734226061489481,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7624,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.43399956288340513,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7323,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.42319000811636015,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.772,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.39397233302623236,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.728,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3893213470005053,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7019,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.35227510257967026,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7391,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.47287328029829406,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.9142,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.47962860635829735,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7615,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4579895901402684,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.763,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5216593170553835,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.8671,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5464788696394796,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8131,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4167947720982017,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7709,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.49472030945838674,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8371,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3744782140453558,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7029,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4528021003842892,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8865,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4589123011711344,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7524,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.42839347958134205,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.8261,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.36902286633529063,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.7776,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.6244510375840064,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8404,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.44194539682664075,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7606,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4265631702274794,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7329,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.39222524745628834,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7171,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3705633072729717,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7362,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.44923869821756857,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7928,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3860490982345257,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.6791,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4741707270491527,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.8472,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4254148763209672,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7223,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4413502994303793,
+      "learning_rate": 0.0001,
+      "loss": 0.7815,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.35458312224766875,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7146,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.4469590316769706,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7549,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4263386322112164,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7745,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4149606139434068,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.8145,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4256621117844649,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8236,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.39257904027478074,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7016,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4535928960965414,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7954,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4233331815902363,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7895,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.42189751090775524,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7161,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.43914005287019414,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7527,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.42567239550140645,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.8104,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.417351543024337,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.771,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.42762751595369664,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7527,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5091564310729734,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7185,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.5191352153239585,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7472,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.39052374687232233,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.705,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.44439005828791617,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7689,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.49277191680042626,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7774,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4762458485994561,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8411,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.47290284538716687,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.8232,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.4726519028658229,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8124,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4203880375352218,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7598,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3941442723055963,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7341,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5054516938068756,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.8368,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4243132536885088,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.7671,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4534177322388772,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7765,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.43927406680363684,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7457,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.44389773911521946,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7933,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4039812656926671,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7207,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4428805100373204,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.8276,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4673188138115627,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7872,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.47265904688976484,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7749,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.42953052839076467,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7322,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4723567917036063,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.8228,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.47214932017910977,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7907,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.49575133422589773,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.8083,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4971390236393325,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7912,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.46568071103960657,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7432,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.35950760162351136,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7595,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3852520082408446,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.719,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4144688519728944,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7519,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.574956448805601,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.8408,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4133163229719489,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7867,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.48036249996384844,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.8117,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4847266611954784,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7896,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4067581760586959,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.728,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4187734227054212,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7341,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.503269253418501,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7621,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.506259067716606,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.8065,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4241877143025232,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7265,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4062906237273913,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7286,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.4015033852678729,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7966,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4427337294004432,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7994,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.36670330249588673,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7539,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.46359869941524046,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.843,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5250059887349049,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7989,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4931760616740979,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.8151,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3855682951716813,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7323,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4148347036245805,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7118,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.47022827966731523,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7955,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3918604326849458,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7425,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.40201106317085633,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.6585,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4064711104467062,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7507,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.43230832543129016,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.8291,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.41086763348387045,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7254,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.41106326675945515,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.6617,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.36682224813291225,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.679,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.5166929327698213,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.7117,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.36736057328082905,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7006,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4151280117511326,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7469,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.4504996548534877,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7587,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.36058371335471157,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.7505,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.4300359809363015,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7914,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4000285275376355,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7143,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.6151509764187009,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.8776,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.47029669638735594,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7602,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.4206925243151734,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.678,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.45501137804658276,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.7416,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.39642161326813075,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.721,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.38016310546539966,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7745,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.35166570303643224,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.6992,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.5630100003025338,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.9032,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.46440941920744416,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8255,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5028963259761965,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7553,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.5342454680747596,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7309,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.638561461721519,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7846,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3975459542376279,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.6635,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4324235174565061,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.7236,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5051293824930861,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.8115,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.44916051057375367,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7492,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.44819925713408015,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7944,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4955285294947814,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7128,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3593977013244792,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6488,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3796390525191943,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.6952,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.4320687103990978,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6733,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3930783842804485,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6755,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.44459322981973176,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7601,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.367438558696493,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.6373,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.49124899058752686,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7743,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.471827562532131,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7304,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.4361909599488475,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7808,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.4267501395480655,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7604,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4183946493041821,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7939,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3877019799044561,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.7391,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.47664260993902013,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.8213,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.43437766674982364,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7556,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.38702892913749193,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7246,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.38285741560545905,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7138,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4355115545752378,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7514,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3825113651972068,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7203,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.40434287867385094,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7203,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4619415343823919,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.8128,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.49297866762546866,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.8996,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.39966759598042595,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7499,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.41912825742309595,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7187,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.48507379726989786,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.7198,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.42079408254398726,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7445,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.40029250602009475,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7318,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5352601493061019,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6953,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.4103055555089203,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.8103,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.50097946349166,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7665,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.40415390709399,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.7174,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.36190735541695723,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6558,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4511855196083346,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7109,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.47199208951992744,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.8111,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.6014238116410946,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7794,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.528098105779227,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.8053,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.37455150312706875,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.6943,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.433073283971249,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.8183,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.43399474629439905,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7625,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.46784582464558705,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7781,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4010880071391638,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.6853,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4981044915202732,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.8253,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.37439237196850206,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7231,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4433273805623505,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7276,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.40452217918771727,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.7221,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4150619811345609,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.6725,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.413438773475868,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7608,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4377068431797678,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.747,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5277822195393723,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.7971,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.43945048480422544,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7237,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4379035408937839,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7532,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.4342441153115403,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7447,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.6481027348435237,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7571,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5314548036716706,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.903,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.40313175330133494,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7583,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.40735174719853,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6696,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.42449345279075135,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.6966,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.48329070024673454,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7644,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.40709003079455985,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.6849,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.4243713258025949,
+      "learning_rate": 0.0,
+      "loss": 0.6685,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 268297746972672.0,
+      "train_loss": 0.7987000615550921,
+      "train_runtime": 4814.0159,
+      "train_samples_per_second": 1.039,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 268297746972672.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..1450b0c1c8fc11f7f719894e0c00265434e0f3f5
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b3f998b8e3b8566304856ad765e8aa63893fb368
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50772b939e1de0f2bf3823b30fb6656d983f334cb3b840955e81494b3bd8b1bb
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b7eb4ea38e6d7b201b1aef80c4da749256eae7c7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6baf09c73d0dd685eb351304523c5f0b813e0ff2931735551313af2e9eb78143
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe7ec2e7eb56566f968592e8f8564b4d6d7422a6
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,1134 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7852892419385117,
+      "learning_rate": 4e-05,
+      "loss": 1.2744,
+      "step": 1
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.8123301324016373,
+      "learning_rate": 8e-05,
+      "loss": 1.356,
+      "step": 2
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5966672278005135,
+      "learning_rate": 0.00012,
+      "loss": 1.1992,
+      "step": 3
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7054873167223312,
+      "learning_rate": 0.00016,
+      "loss": 1.1633,
+      "step": 4
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7603762104866024,
+      "learning_rate": 0.0002,
+      "loss": 1.0346,
+      "step": 5
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5885181651908423,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9551,
+      "step": 6
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5209000034448891,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9527,
+      "step": 7
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.6632635263946058,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9641,
+      "step": 8
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.3958275154400083,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8922,
+      "step": 9
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4482705445105819,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9568,
+      "step": 10
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4507926704898577,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.931,
+      "step": 11
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5101526548814669,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9284,
+      "step": 12
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.38789254584648825,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8089,
+      "step": 13
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4622760386572833,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.9155,
+      "step": 14
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3982764045802694,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9136,
+      "step": 15
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.40694387692825534,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9076,
+      "step": 16
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.3874800847072502,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8438,
+      "step": 17
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4451084313545965,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9307,
+      "step": 18
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.37554281242060195,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8146,
+      "step": 19
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.38914086120007607,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8749,
+      "step": 20
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.36841739158575376,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8045,
+      "step": 21
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.3659882630147638,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8193,
+      "step": 22
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3792566589980987,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8676,
+      "step": 23
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.3788031811405532,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8562,
+      "step": 24
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.34891003346865745,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8594,
+      "step": 25
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3736972128807295,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8575,
+      "step": 26
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3334219023961695,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.815,
+      "step": 27
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3537731834400865,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7869,
+      "step": 28
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3847612896751333,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8598,
+      "step": 29
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.337348424209412,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7764,
+      "step": 30
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.35287051664332336,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8194,
+      "step": 31
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3363048601214956,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7572,
+      "step": 32
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3595884998735282,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8212,
+      "step": 33
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3589845049590864,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8351,
+      "step": 34
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.32468876356527243,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7845,
+      "step": 35
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3282456892411546,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8261,
+      "step": 36
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3660913696816195,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8152,
+      "step": 37
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.36806893017265807,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8252,
+      "step": 38
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3456274494023383,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7714,
+      "step": 39
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.335467839910172,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7899,
+      "step": 40
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3206294504592128,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.7523,
+      "step": 41
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4305300228395664,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8279,
+      "step": 42
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.37912399345109443,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8045,
+      "step": 43
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.36339194511211986,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.797,
+      "step": 44
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.5248417040381882,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8783,
+      "step": 45
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3488749810196314,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8035,
+      "step": 46
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3357970783896402,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8151,
+      "step": 47
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3480514063611181,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8203,
+      "step": 48
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3681154816060956,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8617,
+      "step": 49
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3358072390594222,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7818,
+      "step": 50
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3479737624601583,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8262,
+      "step": 51
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3316081314059503,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7526,
+      "step": 52
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.37147448547494943,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7727,
+      "step": 53
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.303285830093508,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7596,
+      "step": 54
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.33875140638156126,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.844,
+      "step": 55
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.34270498566617125,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7591,
+      "step": 56
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3538160657726156,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8131,
+      "step": 57
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.335452971166759,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7826,
+      "step": 58
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.35385521526738917,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8304,
+      "step": 59
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.31921207731876666,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7903,
+      "step": 60
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3121946146036695,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7909,
+      "step": 61
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3038170983694278,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7746,
+      "step": 62
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.35674244978824016,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7855,
+      "step": 63
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4191130455204281,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.84,
+      "step": 64
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.34596220764040214,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7996,
+      "step": 65
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3126095224863489,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7536,
+      "step": 66
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.30503266549923264,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7911,
+      "step": 67
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.31065634553624866,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7432,
+      "step": 68
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.2913333062839636,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7101,
+      "step": 69
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3054011831296645,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8268,
+      "step": 70
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3344018740472871,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7596,
+      "step": 71
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.3800228897057949,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8404,
+      "step": 72
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.33273799548180377,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8014,
+      "step": 73
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.30404580113092516,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7902,
+      "step": 74
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3308579272219948,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7864,
+      "step": 75
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.37985174493832174,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8043,
+      "step": 76
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.33269596150791886,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7424,
+      "step": 77
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.2725402036547681,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7238,
+      "step": 78
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.2962381633437675,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7314,
+      "step": 79
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3256654465787873,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7824,
+      "step": 80
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.29941414975909786,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7469,
+      "step": 81
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.33592948893537333,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7677,
+      "step": 82
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.30615101882806117,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8134,
+      "step": 83
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.30955307869065124,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7459,
+      "step": 84
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3091065912061399,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7542,
+      "step": 85
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3169309045909451,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7851,
+      "step": 86
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.2923730971006948,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7591,
+      "step": 87
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.33526977572554184,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7355,
+      "step": 88
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3050894393762434,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7376,
+      "step": 89
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3445834949944767,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.811,
+      "step": 90
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.37838356459059586,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8188,
+      "step": 91
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3029282472334983,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7477,
+      "step": 92
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.34607185935788465,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8049,
+      "step": 93
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3311291086559154,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7643,
+      "step": 94
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.31724066310492993,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7581,
+      "step": 95
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3209832103698408,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.8116,
+      "step": 96
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3187970820609727,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7537,
+      "step": 97
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.335328138754125,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.8079,
+      "step": 98
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3592372773952761,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.803,
+      "step": 99
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3082878882980173,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7554,
+      "step": 100
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.29065935607242344,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7405,
+      "step": 101
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3627304498422207,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.815,
+      "step": 102
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.352388441861533,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.8045,
+      "step": 103
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3027712417529716,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7331,
+      "step": 104
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3766636606932217,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7868,
+      "step": 105
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3121526055286457,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.729,
+      "step": 106
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3149302698562098,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8027,
+      "step": 107
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.31556013440906944,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.8025,
+      "step": 108
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.39623874251487173,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.8117,
+      "step": 109
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.296852741991719,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7246,
+      "step": 110
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.32969648147687,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7727,
+      "step": 111
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.29024538079430984,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7107,
+      "step": 112
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.30756514170597327,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7812,
+      "step": 113
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.28057012148418414,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.6756,
+      "step": 114
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3128963704221699,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7137,
+      "step": 115
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.31033894256136946,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7585,
+      "step": 116
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.2894764903995195,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7783,
+      "step": 117
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.37786140371536453,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.8071,
+      "step": 118
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3198328748759536,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7239,
+      "step": 119
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.30767614800890697,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7363,
+      "step": 120
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.2712712834022847,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7436,
+      "step": 121
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.379576246128743,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8755,
+      "step": 122
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.36741356623291405,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7485,
+      "step": 123
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3946626921728505,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7333,
+      "step": 124
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.340556183318827,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7736,
+      "step": 125
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3343972062233843,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7789,
+      "step": 126
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3231610917304344,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6847,
+      "step": 127
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.2917533010139889,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6881,
+      "step": 128
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.30993215869001783,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7259,
+      "step": 129
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.31379134473465536,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7155,
+      "step": 130
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.34094723704701696,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7619,
+      "step": 131
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3713863817615012,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7855,
+      "step": 132
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.31240753593609405,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.788,
+      "step": 133
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.30270241182524654,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.75,
+      "step": 134
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.295655653102901,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7396,
+      "step": 135
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.28558293795235745,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7301,
+      "step": 136
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3520550476656149,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.8676,
+      "step": 137
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.29355394153082187,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7446,
+      "step": 138
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.45225443281815797,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7387,
+      "step": 139
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3417791223034832,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7224,
+      "step": 140
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3378646955827422,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7994,
+      "step": 141
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.31479109212880263,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6944,
+      "step": 142
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3296702583400903,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7694,
+      "step": 143
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.40151803179348267,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.8046,
+      "step": 144
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3199316419603901,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7664,
+      "step": 145
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.32793118837673924,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7785,
+      "step": 146
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.316461221508605,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7653,
+      "step": 147
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.29821877779102307,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7334,
+      "step": 148
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.2975960096717064,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7078,
+      "step": 149
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3103506427392654,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7623,
+      "step": 150
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.39376025834993006,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7689,
+      "step": 151
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3136092190540982,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7549,
+      "step": 152
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3638008909987968,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.8375,
+      "step": 153
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.27970864537532564,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.719,
+      "step": 154
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3423685826931903,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7382,
+      "step": 155
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.29697788326296587,
+      "learning_rate": 0.0,
+      "loss": 0.6871,
+      "step": 156
+    },
+    {
+      "epoch": 0.9984,
+      "step": 156,
+      "total_flos": 391611316568064.0,
+      "train_loss": 0.8044182276114439,
+      "train_runtime": 4779.7338,
+      "train_samples_per_second": 1.046,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 391611316568064.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d50a2d4ff590d638f38beed373bfb60235a3bc8
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5c16ca5213690cdc7c54be8186bafbe97a9e1b2e
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:760853fdc0fb5b855c3eb934a37009087db4445d5b71b78ae90a33afe305b742
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ad1ecd815b74efe7c3e59f287532d27ba3bff7df
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4f67e42a4d1bde1c7ca898b2b55103bbbbbef502a896e2df1b374fd0e203f55
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f68f5c45b267cb39c080a3a4b6e96c1f158fc985
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9403164467491822,
+      "learning_rate": 2e-05,
+      "loss": 1.326,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8161387978103016,
+      "learning_rate": 4e-05,
+      "loss": 1.2227,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7903570972127051,
+      "learning_rate": 6e-05,
+      "loss": 1.2749,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7372082384164766,
+      "learning_rate": 8e-05,
+      "loss": 1.2725,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.647136454087788,
+      "learning_rate": 0.0001,
+      "loss": 0.9416,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.966111147557462,
+      "learning_rate": 0.00012,
+      "loss": 1.14,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8972547699219121,
+      "learning_rate": 0.00014,
+      "loss": 0.9772,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.9556489947629087,
+      "learning_rate": 0.00016,
+      "loss": 1.136,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6566249511477352,
+      "learning_rate": 0.00018,
+      "loss": 0.9801,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.6088085260386118,
+      "learning_rate": 0.0002,
+      "loss": 0.8836,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.49451119398317356,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.8984,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4947685304541711,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.8886,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5592752857560825,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9013,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6421409188740781,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9336,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6746215654926132,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9473,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.6261733851250406,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9251,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.556355660919846,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.846,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5333528169937837,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8921,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5948344826803486,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 1.0135,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5050731254706105,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8385,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.49886787632077434,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.9134,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4895502396158153,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.8997,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.5133303676414952,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.9528,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5645289165267436,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8669,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4771886806387586,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8335,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4785744263962845,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.7468,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5817774914266006,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.9147,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.48793648235873005,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8854,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.537552490827491,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8859,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5220128682464678,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9127,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4380001036261557,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8505,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.6010189197957251,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9349,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.47811971240378376,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8648,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5132998311126512,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8049,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5758538447495875,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.9162,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.7230922488004969,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9278,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.7361533502274045,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8012,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.44135926098043454,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8142,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5139493655462305,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.9134,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.448488097956857,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8364,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 1.000681699067436,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.7512,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.6650851792362262,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8665,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5589585903785106,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.7782,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5661529623212822,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8537,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5326090290505959,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.902,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5441834018302,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8372,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5592781197360509,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.9005,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5055356622946707,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.819,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4494087343581663,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8433,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5275668960943449,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8789,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.6141329983651026,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8704,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.47629567533869904,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.845,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.39833138970678816,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.7865,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.6735185416773903,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8505,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.46919563547695103,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7927,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4804315793841404,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.786,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5548874913743832,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.9485,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5111549241658514,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7729,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5034137626819486,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7852,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4488108829174529,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7742,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5188184982861153,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.8249,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4365343224794975,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8146,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.45200806267614635,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8113,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.43789202866455496,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7082,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5317837475598077,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8287,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4606302149091934,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8177,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4475068092534531,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8476,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.5527007239826185,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8322,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.42955105675771005,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.7941,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5103288402468691,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7992,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.7192929984424484,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.9048,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4309551951903274,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.7482,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.524905218715058,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8054,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5183750960618888,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8357,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4974939734697588,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8283,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4793129645486592,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8369,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.46697413868878285,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.8139,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.47135290687461023,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7284,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4994984531684765,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.8283,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4171800017001343,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7675,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.42595149156295664,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.7655,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.482499526895452,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.746,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5259500639822324,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.8482,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.6245585780675597,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8193,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5027327496646725,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.7842,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.5114834099652439,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8276,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.43072609121932287,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.7218,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.564639105906531,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8817,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.604682876111702,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.9515,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.48198587224648937,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8297,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.6003370287960951,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.8569,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3842765293637917,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7724,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4002523327211069,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7597,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.48400032239723845,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.878,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5542732995154189,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.8849,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4261167404490184,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7697,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5643656966798687,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.9232,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4206152573714265,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8167,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4278369117950475,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.8039,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4309359068500558,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7743,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.40943336312292444,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.8352,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.5359876660658226,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.822,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4466159127813941,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.6908,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.49271795050235634,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8305,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5525175573731309,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.8152,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.5210246095031947,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7472,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4420260279231557,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7831,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.6487323183372163,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7425,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4699987779256609,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8933,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4592059242516297,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8075,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.42880936536633296,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7539,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.46857951495905303,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7717,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.5012514304043896,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.8425,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.479922785247191,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8044,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4661734763422208,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8304,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.46281454027918595,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7486,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.43443054674776366,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7757,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5549461110214192,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8918,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.450064419920354,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.7735,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4380240394994644,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8189,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4455704564544761,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.7848,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4265943986788346,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.8118,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4387296129649349,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7984,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4082806194667678,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7666,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.397887808109905,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7781,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5063382696810116,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.8107,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.5402999745501336,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8604,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.5511794660615774,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.8264,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4049254874424734,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7375,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.6030364224349011,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.8861,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.5268547885056347,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7992,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4038438055064155,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7215,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.5467774429109289,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.8325,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4181719546030265,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7641,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.46070085076217804,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7321,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4580405416596679,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7703,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.40519612037378994,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7263,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.41765952251996086,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7021,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3745934515144171,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7374,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5127091782649509,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.9169,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.48491962580869874,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7599,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.49383390158049373,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7598,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5197953411631441,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.8641,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5454779674561641,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8128,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4313475681156491,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7707,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.5132205432582659,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8376,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3982886336613627,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7012,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.44730572163431054,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8852,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.48304105246350115,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7548,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.43379715260938845,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.8249,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3722269585690899,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.7763,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.6015833868175456,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8335,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.45155148311203713,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7608,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4323587816287022,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7324,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.39303924688851216,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7209,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.43511898322913245,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7365,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4498897672241139,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7935,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.47264276687337253,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.6799,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.509864229696816,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.8467,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.45543437884234866,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7217,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.44808316462676906,
+      "learning_rate": 0.0001,
+      "loss": 0.7794,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.35941205649478886,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7158,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.45781756180347105,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7577,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4379183406476885,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7761,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4190665599840034,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.8133,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4362902610839825,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8272,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4151590081592175,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7042,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.45458142730658896,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.8005,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.4458988263161724,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7927,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.42688999981693,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7168,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.48919574950425926,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7535,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.45653322268010754,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.8149,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4572002897224282,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7722,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.40101980297192197,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7546,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.48413738730953726,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.717,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.42046000647002596,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7465,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3975721995046092,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7072,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.44807879727459876,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7657,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4340176534043155,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7796,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5286008620789443,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8444,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4898308649785636,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.8207,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.47228683476043576,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8161,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.445910528188009,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7627,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.5839084167816913,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7343,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5419196487076774,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.8361,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.434670829219984,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.7691,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.46226934049123536,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7731,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4331061643516525,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7479,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.454834956540033,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.793,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4121086759968263,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7219,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.45158172523113455,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.8289,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4263714476961183,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7898,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.5238655375104982,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7748,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.42185923206259196,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7328,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.441024424461854,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.825,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.49032435666200364,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7963,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.5039392484271568,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.8126,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.49666330579733103,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7889,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.45539269569468943,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7439,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3681914738253727,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7601,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3879745595310054,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.7215,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.559525163793432,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7523,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.5639402939049178,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.8384,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.42349218652727627,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7876,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.49184743437053013,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.8181,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5089572330169558,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7941,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.419386781205736,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.7303,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4816466559542788,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.732,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.5222937607988991,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7668,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.5027928990810613,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.8038,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4220864972323868,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7285,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4176007443523416,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7271,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.45515802194139304,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7978,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4577351921329997,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7971,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3790064243314445,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7556,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4983988032374861,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.8468,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5379087413391773,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7967,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.5147776717401932,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.8165,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.6249784194430409,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.732,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.41001673871650374,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.712,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.625582050090508,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7917,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.39828725874660814,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7441,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3819450698806339,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.6571,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3990032101593105,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7529,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4350507667639973,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.8304,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3978367820930633,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7225,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.40257043569078077,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.6622,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3700010193562475,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.6825,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.469112884691295,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.7108,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3645240671438613,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.6997,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.43547382362805426,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7498,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.5039311910226827,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7603,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.36654507387112795,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.7533,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.4369475913914052,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7923,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4053219846239178,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7163,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.6110807512044899,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.8785,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4700108092984455,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7624,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.430086120412855,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.6837,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4471230873786409,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.743,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.38842437757365395,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7212,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3880894085413885,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7728,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3550252609641288,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.6983,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.5742894720748866,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.9021,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4756840750985434,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8286,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4746175037861587,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.755,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.5305745533741653,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7264,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.6745686774384193,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7858,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3991822202289771,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.6646,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4300627097278966,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.7243,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.6221113967090829,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.8133,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4396383168090041,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7512,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4322242291580685,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7927,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4881464046761018,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7112,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3635967883127235,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6488,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.9162631115013509,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.6927,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.4232183438208216,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.675,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3983414780249098,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6749,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.43736649086026275,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7566,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.38352977687355905,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.638,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4929938245776581,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7796,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.47575598281648185,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7288,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.44743746064919565,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7818,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.4277467149482736,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.761,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4111108176793066,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7931,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.38627127454224836,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.7401,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4865842859255661,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.8224,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.43387943324067996,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7584,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3877846450634093,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7267,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3751685403252838,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7147,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.43956537906372245,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7505,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.39081466236870016,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7204,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.40751146184111137,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7196,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.5113127556918922,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.8143,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.49028884149407453,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.901,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3963954743021353,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.749,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.43387737277821176,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7181,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.4257288320928217,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.7162,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.45149432731324496,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7434,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.39795054274931096,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7313,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5787298763731419,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6926,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.41569392614976874,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.8113,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.5286309729383197,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7696,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.41586498970579483,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.7173,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.40098365613370385,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.654,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.45275196123906825,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7096,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4704592086843827,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.8154,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.6155616558939484,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7761,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5371537043394504,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.8077,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3704992794499012,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.6935,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.43499353149792624,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.8202,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.4404320324404543,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7615,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4656663216499453,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7776,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.37652168273299075,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.6871,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4839018460466476,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.8256,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.37824459079415657,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7259,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.45051573211287965,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7285,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.39407823652788115,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.7209,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.44125661559905305,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.6752,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.41703806836884816,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7589,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.44452416210889956,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7459,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5897734862202882,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.7962,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.47808881340202486,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7262,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4422678402673698,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7511,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.44324451607132576,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7456,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.5271824627724633,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7581,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5194352897373036,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.9021,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.39981380158689,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7602,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.4864393541236497,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6679,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.44453238041721077,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.702,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.49408675694214305,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7702,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.42048527475207637,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.6854,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.43465444254337887,
+      "learning_rate": 0.0,
+      "loss": 0.6674,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 268297746972672.0,
+      "train_loss": 0.799005626867979,
+      "train_runtime": 4811.5388,
+      "train_samples_per_second": 1.039,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 268297746972672.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..25928ca95c5fd6abf34f0d48986c6bab205e802c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj",
+    "down_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..389fb1b8eee43f98021b7a0f22821aa2e5049075
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7530b7382ce3933dd24c6111408fd392a68b67efc64148e67fc49598d846871a
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a73d0d68280885eaf1500b8af9523f36b09f8789
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31a054c6255fa4ba538f90a94104809531922320e7d190502cc2784e2efc5194
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c31db710bff612687ef4e2b1d2fb7913a00f2beb
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,1134 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8035340793505722,
+      "learning_rate": 4e-05,
+      "loss": 1.2744,
+      "step": 1
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.8326815210726537,
+      "learning_rate": 8e-05,
+      "loss": 1.356,
+      "step": 2
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6048062185696337,
+      "learning_rate": 0.00012,
+      "loss": 1.1985,
+      "step": 3
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7065372561442537,
+      "learning_rate": 0.00016,
+      "loss": 1.1624,
+      "step": 4
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.791038287770007,
+      "learning_rate": 0.0002,
+      "loss": 1.0353,
+      "step": 5
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5926212931857436,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9565,
+      "step": 6
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4961935514321317,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9528,
+      "step": 7
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5927044662113193,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9635,
+      "step": 8
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.41203424704440733,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.894,
+      "step": 9
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.48187851159803763,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9567,
+      "step": 10
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4577696835046481,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.931,
+      "step": 11
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5392759187031103,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9288,
+      "step": 12
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3962306342126496,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8091,
+      "step": 13
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.45163495803519205,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.9157,
+      "step": 14
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4039255911085676,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9128,
+      "step": 15
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4144274624343591,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.907,
+      "step": 16
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.39407070351038004,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8452,
+      "step": 17
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.43548916333524695,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9298,
+      "step": 18
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.38724088031902365,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8181,
+      "step": 19
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3811996227647024,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8756,
+      "step": 20
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.3720832384145608,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8054,
+      "step": 21
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.37162449995068403,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8206,
+      "step": 22
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.37721195841803645,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8668,
+      "step": 23
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.3781960709473528,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8555,
+      "step": 24
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.34823275392631015,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8595,
+      "step": 25
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3697283311475832,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8577,
+      "step": 26
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3279726724431966,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8154,
+      "step": 27
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3361144174375379,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.787,
+      "step": 28
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3789000717657513,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8598,
+      "step": 29
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.33683848816432094,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7757,
+      "step": 30
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.35834671597383877,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8197,
+      "step": 31
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3326729551946847,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7574,
+      "step": 32
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.34995163009170244,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8209,
+      "step": 33
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3639572220576174,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8355,
+      "step": 34
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.32212467900780717,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7841,
+      "step": 35
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.32501947399021053,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8259,
+      "step": 36
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.38201116102721167,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8169,
+      "step": 37
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3626434293467508,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8258,
+      "step": 38
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.34313519341303317,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7715,
+      "step": 39
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3275583265760916,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.79,
+      "step": 40
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3191062587367316,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.7519,
+      "step": 41
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4092994506893867,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8274,
+      "step": 42
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3695189858901441,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.806,
+      "step": 43
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3545162554280589,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7963,
+      "step": 44
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4324311848015332,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.88,
+      "step": 45
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.35465287211803115,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.804,
+      "step": 46
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.31942958758251727,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8166,
+      "step": 47
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3365577635582448,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8196,
+      "step": 48
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.36006440509046755,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8598,
+      "step": 49
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3306811147622879,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.783,
+      "step": 50
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.34268843518010145,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8273,
+      "step": 51
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3158941962722818,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7514,
+      "step": 52
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3731863427097849,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7711,
+      "step": 53
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.31209125072987703,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.76,
+      "step": 54
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3448081778380014,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8421,
+      "step": 55
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3451816731920408,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7579,
+      "step": 56
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3840857009752893,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8128,
+      "step": 57
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.33758751388577785,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7834,
+      "step": 58
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3695050281343937,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8295,
+      "step": 59
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3346439954641722,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7894,
+      "step": 60
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.31930767512743163,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7925,
+      "step": 61
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3019269522549601,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7783,
+      "step": 62
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.34031256857467895,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7858,
+      "step": 63
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.42760295970656664,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.8397,
+      "step": 64
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3606486382014007,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.8028,
+      "step": 65
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3159985053530957,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7535,
+      "step": 66
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3154305870908761,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7915,
+      "step": 67
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3291007330811774,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7444,
+      "step": 68
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.29839013247373436,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7099,
+      "step": 69
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.31924199456699764,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8262,
+      "step": 70
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3548228413411494,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7585,
+      "step": 71
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.40046011439240137,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8403,
+      "step": 72
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.33654136487192543,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.801,
+      "step": 73
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.31460927305651093,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7911,
+      "step": 74
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3549394122392794,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7862,
+      "step": 75
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.38241670589324783,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8062,
+      "step": 76
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.34489555449239506,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7434,
+      "step": 77
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.2806231949464588,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7229,
+      "step": 78
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3000699983392725,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7321,
+      "step": 79
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.33595212091928367,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7825,
+      "step": 80
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2995860860695928,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7472,
+      "step": 81
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.33327316911841876,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7664,
+      "step": 82
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.364902553727858,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8141,
+      "step": 83
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.30391859609685723,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7451,
+      "step": 84
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.30695551868865256,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7542,
+      "step": 85
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3227497614839023,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.785,
+      "step": 86
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.2990974753627751,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7599,
+      "step": 87
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.32988199154175873,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7347,
+      "step": 88
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.32116937919851646,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7402,
+      "step": 89
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3398716441196792,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8113,
+      "step": 90
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3701808895751357,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8209,
+      "step": 91
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3172929447749312,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7494,
+      "step": 92
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.36440455545982475,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8047,
+      "step": 93
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3782297199151222,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.764,
+      "step": 94
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3302517040045045,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.757,
+      "step": 95
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.31925075939475256,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.8109,
+      "step": 96
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.34597884431932663,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7528,
+      "step": 97
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.34399804653158134,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.8112,
+      "step": 98
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4503304168982117,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.8019,
+      "step": 99
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.31408882640451896,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7539,
+      "step": 100
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.2950655274806464,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7406,
+      "step": 101
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.36816918991595826,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.8141,
+      "step": 102
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.36209149313356365,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.8046,
+      "step": 103
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3134766251824321,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7352,
+      "step": 104
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.48296319857057174,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7882,
+      "step": 105
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3160209673487955,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7291,
+      "step": 106
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.31709343856767247,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8039,
+      "step": 107
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.31063930635038317,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.8023,
+      "step": 108
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3956030948962467,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.8113,
+      "step": 109
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3123712643671273,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7253,
+      "step": 110
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3482739347672168,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7742,
+      "step": 111
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.2863058456033559,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7096,
+      "step": 112
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.302090802722911,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7803,
+      "step": 113
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.28436088477268534,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.6765,
+      "step": 114
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3126965341963825,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7144,
+      "step": 115
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3180784275437213,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7593,
+      "step": 116
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.2978419987936676,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7781,
+      "step": 117
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.41255804552728154,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.8066,
+      "step": 118
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.32837895155628116,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7248,
+      "step": 119
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.32726215856422297,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7365,
+      "step": 120
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.2739476755750894,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7443,
+      "step": 121
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.37548914623504875,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8744,
+      "step": 122
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3604271713813052,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7482,
+      "step": 123
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4060539170115801,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7333,
+      "step": 124
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.34529267555652776,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7743,
+      "step": 125
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3390417346838904,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7796,
+      "step": 126
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.33150772182289123,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6846,
+      "step": 127
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.29328252821956335,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6885,
+      "step": 128
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.311185954863508,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7278,
+      "step": 129
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3285936074265703,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7144,
+      "step": 130
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.33971634630816916,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7638,
+      "step": 131
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.2978291588736038,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7856,
+      "step": 132
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3133959872428493,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7897,
+      "step": 133
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3013524652462147,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7488,
+      "step": 134
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.2944488340362683,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7387,
+      "step": 135
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.502661096719573,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7307,
+      "step": 136
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3480637990631335,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.8673,
+      "step": 137
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3030909147874621,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7444,
+      "step": 138
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.315645662045372,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7382,
+      "step": 139
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3495253840763391,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7223,
+      "step": 140
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.41048950032203585,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.802,
+      "step": 141
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.2839975051047652,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6948,
+      "step": 142
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.41928555434574294,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7693,
+      "step": 143
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4122207518174965,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.8077,
+      "step": 144
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.29709260516698854,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7667,
+      "step": 145
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4301082952564468,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.778,
+      "step": 146
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.32064610401192734,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7654,
+      "step": 147
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.2986447331282153,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.732,
+      "step": 148
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.29822579082757045,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7077,
+      "step": 149
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3080627982411816,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7625,
+      "step": 150
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3623747242759375,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7701,
+      "step": 151
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.30089343943365976,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7548,
+      "step": 152
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.36369213030522163,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.8372,
+      "step": 153
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.28995750962922084,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7195,
+      "step": 154
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.34177664411092873,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7391,
+      "step": 155
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3122340778319871,
+      "learning_rate": 0.0,
+      "loss": 0.6868,
+      "step": 156
+    },
+    {
+      "epoch": 0.9984,
+      "step": 156,
+      "total_flos": 391611316568064.0,
+      "train_loss": 0.8046467529657559,
+      "train_runtime": 4784.5328,
+      "train_samples_per_second": 1.045,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 391611316568064.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c6cb7b0be76f388e90234ff3702db6268a74b8f7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2515f7877f8d12eda55aa1af8124ff7a5ecd086a
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d00a83ea5723b0b88550c9d1f34887a155fa60b2a6ccefb9fdfcb07455f3424d
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..07fd6496e5ef7917a07439013b942420b77b7776
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8964d7a41830da964ca1f64c6c82fa7fdee55315b6f4e96b3cef02c9324c4f70
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cbfde9eff16d3fd72a5bfecdf3a39fa008c5fa9
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9475372509233241,
+      "learning_rate": 2e-05,
+      "loss": 1.326,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8255112618496162,
+      "learning_rate": 4e-05,
+      "loss": 1.2227,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.8031490881581672,
+      "learning_rate": 6e-05,
+      "loss": 1.2749,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7386472491239978,
+      "learning_rate": 8e-05,
+      "loss": 1.2729,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.6123801345342339,
+      "learning_rate": 0.0001,
+      "loss": 0.9416,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.9748162205092032,
+      "learning_rate": 0.00012,
+      "loss": 1.14,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.9195383903965376,
+      "learning_rate": 0.00014,
+      "loss": 0.9798,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.9550980567072939,
+      "learning_rate": 0.00016,
+      "loss": 1.1364,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6358806958616194,
+      "learning_rate": 0.00018,
+      "loss": 0.9806,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.6584899966952839,
+      "learning_rate": 0.0002,
+      "loss": 0.8835,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.49892591939119807,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.8981,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4962301388144038,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.8893,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5697491748058041,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9026,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6481407921914579,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9356,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.678142222999374,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9462,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.6228716926229473,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9257,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.562202486455815,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8431,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5479913775413893,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8926,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.6080925105746661,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 1.014,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5065288186619548,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8423,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.5159448796732826,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.9139,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5039012397213576,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9021,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.5287943390341321,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.9534,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4689921770463583,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8669,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.48284728730041415,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8328,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4965084783063032,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.745,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5905989291752782,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.9137,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.47700271812675477,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8862,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5549112880831628,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8866,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.532322622933053,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9127,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.443849424472922,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.85,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5980883187850191,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9338,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.47110264880609937,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.8631,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.5075464761050624,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8048,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5843680350834338,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.9154,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5819950358122334,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9255,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5178130228650352,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.7974,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4602445891115539,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8179,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.518934498544055,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.9162,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4747867659375649,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8348,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 1.129356077205779,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.7498,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5438136919706106,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8675,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.487493061884578,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.7767,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5820093925186012,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8538,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.514133066410687,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.8992,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.5543124541671927,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8356,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5700850850672268,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.9012,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4921387990437491,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8159,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4351184948516209,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.8419,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5393404388572136,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8815,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5002819136243063,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8719,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.47210034832096764,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8463,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.42704344797802224,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.7851,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5050122125112612,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8484,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.46594147112433676,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7922,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.6049026714505561,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7832,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5520570407070383,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.9458,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5258281889886353,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7662,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5042869992665732,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7838,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4415498821824666,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7739,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.528068837310034,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.8279,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4382614858545404,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.816,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.45850262081352067,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8095,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4415344758160172,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7094,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5341688157184947,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8289,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4477850495284644,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8187,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.46563284629446294,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8479,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.5244429014213015,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8311,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4224889085914789,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.7937,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5169843727719514,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.795,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.47188379389130514,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.9052,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5536682110712087,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.7486,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.5019047354964509,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.8023,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5247331966004386,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8345,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5203590067022946,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.83,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4781060465248561,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8388,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4669659122826244,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.8135,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.46355569987707634,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.731,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.5457644600238496,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.828,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.43555675848315856,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7712,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4354177922283847,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.766,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4618480255495938,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.7458,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5179627468498007,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.8463,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.6679783445582437,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8165,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.47708984031515167,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.7847,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.48916241960280127,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8247,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.42819076627920327,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.7198,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5551537423140956,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8808,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.6185891989336596,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.9487,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.48990222253922966,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.83,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.584563973573214,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.8531,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3822568873820786,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.7736,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.39153513190589123,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7613,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4722500361916666,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8785,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5432777140214171,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.8845,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.42141522285453015,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7694,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5603710022840139,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.9227,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4692676989268742,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8197,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.41431728254284184,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.804,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4303135895029476,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7709,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4132483714354605,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.837,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.5513903573976224,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8217,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4367114402281363,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.6912,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.481909466793859,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.8313,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5650200354143932,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.8164,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.6003386843843564,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7437,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.41804442073857784,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7807,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4351168603450991,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7445,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4623350020448244,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8933,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4373188970399322,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8076,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.45494271034653544,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.7516,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.46107867226622695,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.773,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.49119829743495835,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.8354,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4899833574630175,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8039,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4853333535137589,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8305,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.48298296036949917,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7502,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.45567966724483405,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7734,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5396135149138322,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8909,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.44065113727070815,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.7733,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4435587464426928,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8202,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4540090705940411,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.788,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.422623541311413,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.8095,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4333205443650321,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7995,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3981291884403016,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7642,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.39105313135637965,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7737,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5037497450946807,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.8073,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.5488664229378287,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.8581,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.6003351085243592,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.8258,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4091264449134919,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7357,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5470462367821493,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.8844,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4894282121746544,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7962,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4101795046595908,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7266,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.5068711466438554,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.8303,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.38242083266801624,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7646,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.42033032308999724,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7327,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4202205681205054,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7697,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4076882425961211,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7262,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3970760465743167,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7013,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3563358239374254,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7397,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 1.0149081113634044,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.9153,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.47512757583798637,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7641,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4644157016218601,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7605,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5972922545320057,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.8634,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5638793356740267,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8134,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4208532395312315,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7711,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.501311946451056,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8352,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.39223467428190856,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7033,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.456994207970227,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8883,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.45684422134646335,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7564,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.42288440860117754,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.8268,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3625161326066237,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.7752,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.6363628609763268,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8428,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4388850237086708,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7613,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4238194413830428,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7368,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4313229068700299,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7228,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3665458963175191,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7351,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4606084144955922,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7933,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.37454884179441855,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.6785,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4685857988699956,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.8421,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4635048557171666,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7237,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.42921898225240124,
+      "learning_rate": 0.0001,
+      "loss": 0.7781,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3662956360921228,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7159,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.4557524466979869,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7561,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4893732127944404,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7776,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.41560284819860266,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.8182,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.43583050097753673,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8272,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3975629083125364,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7031,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.43624658704902014,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.8014,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.42770672908711543,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7893,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.431287601565606,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7174,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4477161386575832,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7533,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4378644630155987,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.8106,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4172200088521494,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7713,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.36721232643194673,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7491,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4862813172573176,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7231,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.40621933997484455,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7486,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.39390182160705,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7062,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4423733823524504,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7722,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.42234610710821735,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7751,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4790419250342842,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.844,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.47088007396822795,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.8215,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.45537714110666144,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8159,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.43357137618807773,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7605,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3827670305696227,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7338,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.513998983919058,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.8397,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.43364564030184066,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.771,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4675556367134949,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.7752,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4049143779921121,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.749,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.4500976620473903,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7935,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3995426553234267,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7244,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.44562703640404544,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.8293,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4051528567354779,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7884,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.4771405413248115,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7784,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3926186909030838,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7335,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.41976332389254317,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.8242,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.4803382328813548,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7945,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.5297101516864969,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.8094,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.49411259395914287,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7898,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.46352144102877846,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7439,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3696971917361302,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7595,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3814101337331313,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.721,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.40401538235653045,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7503,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.5564174440836027,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.839,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4199095129579862,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7874,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.5007298515301487,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.8153,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4824688967442501,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7891,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.40526171374102343,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.7297,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4166847918577331,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7311,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.49940816867089055,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7645,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4994077839956053,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.8044,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.43752905673208514,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7281,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4041679096739365,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7274,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.4098059795351516,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7963,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.44718722572355823,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7985,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3682360366651651,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7541,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.46498255039759495,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.8432,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5488964536417223,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.796,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.5073904499922492,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.8135,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3952360275521916,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7325,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4104314612045499,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7144,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.7434300816992129,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.793,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4010089987617862,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.743,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3882211408530366,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.6581,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.39187843713439985,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7531,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4193424215582899,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.8272,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4475292298664944,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7227,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4525161513980812,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.661,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3651492327454228,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.6811,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.47595078991005435,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.7122,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.40883568477553434,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7013,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.4204261201847352,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7485,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.45569331512014916,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.759,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3605187290705392,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.752,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.43941323831982837,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7928,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.42392138054141804,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7183,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.6002181145748936,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.8757,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4704650269250136,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7607,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.45646416600668926,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.6787,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4457812809777728,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.7402,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3983232714534782,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7209,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3841756021972201,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7734,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3548275434989953,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.6981,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.5542249068953132,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.9009,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4614383625410636,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8271,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.48453095060087825,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7532,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.5453650007330265,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7288,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.6577342749465136,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7854,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4020345678036813,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.6646,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.428764083923936,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.7225,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.6598289868683983,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.8133,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4611059514614476,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7492,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4346383906696144,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7972,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4903650685366549,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7085,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.34351000406824495,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6492,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.37624588282483906,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.6955,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.42553446492000263,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6734,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 2.210252347918217,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.6773,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.4371212286257628,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7589,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.36722660195541307,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.6387,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5240922919354317,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7771,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.48516917315907054,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.7286,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.4357039541849848,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7802,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.4283435397363714,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7599,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3981078881096444,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7931,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.39156672531944486,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.7401,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.48543910561325665,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.8197,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.4378682167583888,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7549,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.38647455959198573,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7263,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3735332505931595,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7155,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4379368538027031,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7484,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3882393713789715,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7211,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.40882868968147507,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7205,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 1.3945997112526465,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.8109,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.49057638899855605,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.9028,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4188288937856507,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7493,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.40361671819863904,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.719,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.6218687942184031,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.7181,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.4276015947492314,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7432,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3893696064158002,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7316,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5573916049390001,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6943,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.4209353071823604,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.8095,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.5022987674918924,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7704,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4058450991396016,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.7177,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.356976458199042,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6557,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4559370470859033,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.709,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4628851942692234,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.8099,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.6231695580539695,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7739,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5341918529954933,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.8079,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3898975739162987,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.6931,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4323735885773568,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.8201,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.42396609382981215,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7612,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.46687228082218096,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7789,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.3725562535302593,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.6842,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.49985405407575395,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.8269,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3805866889061923,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7256,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.44370791874981386,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7253,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.4260670162379159,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.7221,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.41106896725164604,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.6774,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.40712095378486146,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7613,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.42998666569186,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7444,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.5435849941142359,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.7975,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4555921328168688,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7239,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4302517927607182,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7524,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.43365261945779077,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7458,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4966460762888895,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7581,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5136891806956415,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.8995,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3911369216152323,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7568,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.38558731844673394,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.6653,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.45316318083820073,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.7015,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4903919882922413,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7682,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.40877464708263694,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.6875,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.4338634341048958,
+      "learning_rate": 0.0,
+      "loss": 0.6674,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 268297746972672.0,
+      "train_loss": 0.7987522908892387,
+      "train_runtime": 4818.1513,
+      "train_samples_per_second": 1.038,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 268297746972672.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7f3c8ddd916a07f9ddb19b2de2d535d74d86617
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0217b29baf77a23b7fd9449433bd613b54573b26
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d5ab49d0cbb59628b39a81f071d33fa784c87d1d63facadfc4e1f6ade35885d
+size 671150064
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..de454bc4c8d12af371b63ecda74dc98ff8028b75
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ccea06363f7010d274b2717119338b843a180aa3acd4b6a001cc95cad582498e
+size 918507402
diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e22104f95e4f0b37dffb8c21d73c27a58f964bc9
--- /dev/null
+++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,1134 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.798103541351361,
+      "learning_rate": 4e-05,
+      "loss": 1.2744,
+      "step": 1
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.8268256866411904,
+      "learning_rate": 8e-05,
+      "loss": 1.356,
+      "step": 2
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6024838262870713,
+      "learning_rate": 0.00012,
+      "loss": 1.1987,
+      "step": 3
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7090874925679889,
+      "learning_rate": 0.00016,
+      "loss": 1.1633,
+      "step": 4
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.7767469984784777,
+      "learning_rate": 0.0002,
+      "loss": 1.0355,
+      "step": 5
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5851130403593353,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9556,
+      "step": 6
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5006197073828017,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9518,
+      "step": 7
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.613689066369758,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9631,
+      "step": 8
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.415328797805894,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8929,
+      "step": 9
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4515627425180524,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.9557,
+      "step": 10
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4575354927834304,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.9313,
+      "step": 11
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5521896687236068,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.9291,
+      "step": 12
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3977577767442164,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8088,
+      "step": 13
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4657450149066985,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.9159,
+      "step": 14
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.402853389805612,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9126,
+      "step": 15
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.7706018254819981,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.9101,
+      "step": 16
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.38828276045292986,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8458,
+      "step": 17
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.46581785750217314,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.9317,
+      "step": 18
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.38190582131334216,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8162,
+      "step": 19
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3774002656162366,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.8754,
+      "step": 20
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4236226446364666,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8062,
+      "step": 21
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.38796693490725326,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8203,
+      "step": 22
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.38039026359896194,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8685,
+      "step": 23
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.386980713471517,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8575,
+      "step": 24
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.35314630403308767,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8595,
+      "step": 25
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.3779210500854245,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.8573,
+      "step": 26
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3297621731524984,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8145,
+      "step": 27
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.3394455923366161,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7879,
+      "step": 28
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.375977988038427,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.8579,
+      "step": 29
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.340761944056107,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7752,
+      "step": 30
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3455332956794238,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8181,
+      "step": 31
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.32710973018478223,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7574,
+      "step": 32
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3500517950500944,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8229,
+      "step": 33
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3595737134142622,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8348,
+      "step": 34
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3177571851143064,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7854,
+      "step": 35
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3286868016903798,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8262,
+      "step": 36
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.36845715233852266,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8182,
+      "step": 37
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.35277395281465673,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.8268,
+      "step": 38
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.34235646542887793,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.774,
+      "step": 39
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.32801991248244367,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7912,
+      "step": 40
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3209126529312384,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.7542,
+      "step": 41
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.40794489864543393,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.8257,
+      "step": 42
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.3408888492524192,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8063,
+      "step": 43
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.35959807729052756,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7978,
+      "step": 44
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4259774615159538,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8795,
+      "step": 45
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.3448967552992956,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8024,
+      "step": 46
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3272033029206055,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.8139,
+      "step": 47
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3479088858440763,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.8182,
+      "step": 48
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.359132852878714,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8594,
+      "step": 49
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.31490254874430174,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7838,
+      "step": 50
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.354879440739144,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8281,
+      "step": 51
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.327843049496559,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7506,
+      "step": 52
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.40599580002804275,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7735,
+      "step": 53
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3139259928133556,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7594,
+      "step": 54
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.34277518713692245,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8431,
+      "step": 55
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.337432838751828,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.7585,
+      "step": 56
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.36705196571229975,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8127,
+      "step": 57
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.33158871878903523,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7825,
+      "step": 58
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3439731934454538,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.8281,
+      "step": 59
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.318083568421075,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.7892,
+      "step": 60
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.31565952036149936,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7911,
+      "step": 61
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.30865906135962495,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.7773,
+      "step": 62
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3302165770043087,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7857,
+      "step": 63
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4252224960949748,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.842,
+      "step": 64
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3603281725920143,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7999,
+      "step": 65
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.31189541399551407,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7535,
+      "step": 66
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3087015288022916,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7915,
+      "step": 67
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.305018125969194,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7432,
+      "step": 68
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.2934928706117203,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.7104,
+      "step": 69
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3030298084243325,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8268,
+      "step": 70
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3438139485734787,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7616,
+      "step": 71
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.39297250837172915,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.8398,
+      "step": 72
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.33471051721291323,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8027,
+      "step": 73
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3065066927350221,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7905,
+      "step": 74
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.33318943398753137,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7853,
+      "step": 75
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.42129473104900184,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8072,
+      "step": 76
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.32436875336084614,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7448,
+      "step": 77
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.27784398294634927,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7221,
+      "step": 78
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3055115279913575,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7319,
+      "step": 79
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.31640581286782005,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7814,
+      "step": 80
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.2933834108033171,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7461,
+      "step": 81
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3231004000455766,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7646,
+      "step": 82
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.32688724585582374,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.8137,
+      "step": 83
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3215700871727219,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.7467,
+      "step": 84
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3225959581760687,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7534,
+      "step": 85
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3380772436668406,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.7856,
+      "step": 86
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3947076794390273,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7596,
+      "step": 87
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.32682860564478994,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7352,
+      "step": 88
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3107025811034861,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7378,
+      "step": 89
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3431174414700978,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8123,
+      "step": 90
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.41848267196202366,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8216,
+      "step": 91
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.32206129050676485,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7476,
+      "step": 92
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3797253454059359,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.8047,
+      "step": 93
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.37579587132321085,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7657,
+      "step": 94
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3144160021303849,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7588,
+      "step": 95
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.31872120942680743,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.8133,
+      "step": 96
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.33461256383139165,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7537,
+      "step": 97
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.34544511746242657,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.8093,
+      "step": 98
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3556859669902957,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.8024,
+      "step": 99
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.2975776942983784,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7544,
+      "step": 100
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.2924978663967572,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7422,
+      "step": 101
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3633837849321103,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.8153,
+      "step": 102
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3559642361047184,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.804,
+      "step": 103
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.30450721922184676,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7354,
+      "step": 104
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4018138547264168,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7881,
+      "step": 105
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.31572731104326274,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7304,
+      "step": 106
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3202202734392908,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.8013,
+      "step": 107
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.32290299485712043,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.8035,
+      "step": 108
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3848146563924532,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.8116,
+      "step": 109
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3013951272060788,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7257,
+      "step": 110
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3322449631033,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7721,
+      "step": 111
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.30745286582323006,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7092,
+      "step": 112
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3085363085926714,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7815,
+      "step": 113
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.27770721320241926,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.6754,
+      "step": 114
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3128107292452356,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7151,
+      "step": 115
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3136638116475556,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7577,
+      "step": 116
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3197174021974061,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7777,
+      "step": 117
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.38070102732191174,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.8077,
+      "step": 118
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.32484409248060714,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7242,
+      "step": 119
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.30865721254742123,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7363,
+      "step": 120
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.2782127109708396,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7441,
+      "step": 121
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3862800418970225,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8741,
+      "step": 122
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.36622365622341957,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7481,
+      "step": 123
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4012346259284241,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.7357,
+      "step": 124
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3533622717190259,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7763,
+      "step": 125
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.33243555929314145,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7787,
+      "step": 126
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3139784350612795,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.6837,
+      "step": 127
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.2930961825783699,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.6893,
+      "step": 128
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3132000826330831,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7264,
+      "step": 129
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.33020574839410205,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7152,
+      "step": 130
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.36208350989479543,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7622,
+      "step": 131
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3025134339835463,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7842,
+      "step": 132
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3157431836217189,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7883,
+      "step": 133
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.30264971788974415,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7493,
+      "step": 134
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3020946816597262,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7403,
+      "step": 135
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.29825579056874574,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7303,
+      "step": 136
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.34805042849674267,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.868,
+      "step": 137
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.30748878976922644,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7443,
+      "step": 138
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.31312920419810836,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7381,
+      "step": 139
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3448656445594576,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7215,
+      "step": 140
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.34366821186403623,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7987,
+      "step": 141
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.2842539946388669,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6935,
+      "step": 142
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.33709204356585964,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7689,
+      "step": 143
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4085295140872269,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.8048,
+      "step": 144
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.2966532953057819,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7672,
+      "step": 145
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.33309653198101075,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7779,
+      "step": 146
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3209119196082951,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7653,
+      "step": 147
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3002663218377736,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7318,
+      "step": 148
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.2974717069333661,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7078,
+      "step": 149
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3349525244186073,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7619,
+      "step": 150
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3704839046536139,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7687,
+      "step": 151
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3106434251504329,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7551,
+      "step": 152
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4092648868636652,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.8391,
+      "step": 153
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.2886810815093622,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7204,
+      "step": 154
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3459032074889401,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7398,
+      "step": 155
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3167726444330969,
+      "learning_rate": 0.0,
+      "loss": 0.6865,
+      "step": 156
+    },
+    {
+      "epoch": 0.9984,
+      "step": 156,
+      "total_flos": 391611316568064.0,
+      "train_loss": 0.8046577454377444,
+      "train_runtime": 4785.766,
+      "train_samples_per_second": 1.045,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 391611316568064.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e66c2bb24e52f99f14dcfa8f9782b020664fc4bd
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "q_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e47d0af5868b4d3b3bf4b2bca2faf540b151083a
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65b661237c42e5f110dfe3f432de15d54a240dfed342889c942cda0caf23b07d
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9622252d9037762d6d42e97c1afb6aafca440ad9
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2de6d6d50a593dd1fef4b431769a19f06bab0ed049a2e5b55eceabbd2699d14
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a124a9f137de2e63b9f528ad27572d2ce2f217c9
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.7705987921299485,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.2058,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9287416636509861,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4134,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.6528262057381008,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.1559,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6765882832433683,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1743,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.648079038578141,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.2401,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6566661666440996,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.1006,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.8548354477419776,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.0458,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.543740331558598,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.9982,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.7318228644416274,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9406,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7417915994317061,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.0351,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5922704421680609,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9339,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5862372762823224,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9481,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.535213198327505,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9431,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5089424410764984,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9458,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5097782413871194,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9507,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.47556040207833583,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8975,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5018954188069049,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.9151,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5144020872477301,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9505,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.43303216280426504,
+      "learning_rate": 0.0002,
+      "loss": 0.8126,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5072332356758589,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8382,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.5120915383249092,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8401,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5421958032379722,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8774,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.5150299514545867,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.9377,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4721233844843988,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8796,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.4823898019933978,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8918,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4541381232034085,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8997,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.4798261742844607,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8502,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4551744043933959,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.9257,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.4775774615905478,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9352,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.48409776638768615,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8945,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.44386948052851427,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8241,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.48822961505738594,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.9087,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4991405828880664,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.879,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.47567582863477037,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8483,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.51873526165059,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8829,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5011861028018147,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.912,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.504469326779835,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8259,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3637338707859365,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7404,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4284421793999004,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7995,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5034556480799789,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.9058,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5436751511445966,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.9353,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.42969339484572733,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8426,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.42490544608441266,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7866,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.44823434854700056,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8101,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5603168797723098,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 1.0164,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.47644193806778984,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8952,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.43382576577650683,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7644,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4590678040474791,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.797,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.45059180280160405,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8682,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.458024289787967,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8254,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4708236548074165,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.9232,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4107265066815477,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7776,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.44617773282303985,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8822,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5143311662028887,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.9311,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.410377419821763,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8152,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.45038945127365737,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8273,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4689868980609412,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.9484,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4086286046140391,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7996,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.3925097264811042,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7805,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4468204196738424,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8896,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.427897958968694,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8762,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4694502565493321,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8481,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.44286505229187695,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7239,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.38451087694388386,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7907,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.474696902181304,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.9338,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.3941485079754015,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8234,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.42634108625304457,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7857,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.43721561580101864,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.755,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.41400298876290526,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8191,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4111519335093886,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7836,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.3981205896778025,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8067,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.3944168309890051,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.757,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.4378681168720987,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.823,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.45930105215478095,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8343,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.44690939308144634,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8888,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.41052669970769035,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8003,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.42790787379025635,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7959,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4175218455704906,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8134,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.48152789594791556,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7923,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.37106931295853823,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7594,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.39560355388219715,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8018,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.423707151791688,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7834,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4120453080503355,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7546,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4113061305774698,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7901,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.46589571714326883,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8592,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4305164674690721,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8756,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.42281779829677335,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8405,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4525954367363889,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8727,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.45227946434780003,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.8716,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.44211617864856445,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8879,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.40256649897418,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7606,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3741521146665791,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.6961,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.3705518303191532,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7518,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4752188811217134,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8534,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4102678943154998,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7634,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.45507608170811004,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8659,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.42180581722244276,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8058,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.460635684487175,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8622,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.3851113848164485,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7331,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4544307900666547,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8508,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4125764591291438,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.7378,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.43655716374423936,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.7818,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.3955333708829416,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7435,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4356225922622882,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8667,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4135943321838593,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7929,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.44086428372922054,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8279,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.47213515956851726,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.9233,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.43531197249522713,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8459,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.5097410845471991,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.882,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.46001640374981173,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8223,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4110670427542391,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7927,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.42224965826201943,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.734,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.44770731481345016,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8318,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4718645249580385,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8198,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.43084154367305444,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7576,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.41664958460149204,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8228,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.5159539127338308,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8686,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.44146899039468684,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7658,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.42253703116326774,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7849,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.42141440612086756,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8632,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4173262660076212,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7567,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.6396271573551264,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8004,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.390100213608852,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.745,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.40109375411157056,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7762,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4503019228338576,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8385,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.45327963011421096,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8516,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.4354314586768901,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7582,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4035901466888465,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7655,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.48647013297209213,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8354,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4250445114534265,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8026,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.3835530554003175,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7443,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5045517877721084,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.9247,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.38472219697658894,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.713,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.3970787060759494,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.826,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.42215465881861736,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.832,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3829002961198215,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7654,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.41525532207570953,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7949,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4596257126163404,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7851,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.45575584823464793,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8233,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4460233322200534,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8032,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.4421197126830367,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8437,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3893196845859463,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7453,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4592060372904219,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8468,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4385397356658086,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8273,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.46710183295178787,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7739,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4176634754089223,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7437,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.5003204720116483,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.9364,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.47028599712650937,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7849,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4147643110815873,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.8051,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4211866455889534,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7602,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.42739548556178647,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.8049,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.34268327409876764,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.6774,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.4228806711280117,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7527,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4334201330779741,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7945,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.5008077234098525,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8601,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.44269101925884186,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8329,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.46878674881645654,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8273,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.44699352618503435,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.811,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4350488356067898,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.8307,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3891456533806176,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7757,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4624821361475699,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.9079,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.42936710819614854,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8596,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4414795774129634,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.846,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4094980900461479,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.8341,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3935718900239617,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.789,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.42602983206125355,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.8319,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4693988113168292,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8866,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4527066533195711,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8334,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.4315756796958446,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.8035,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.38470897900882584,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7471,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4138045470291073,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.8217,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4490076288389667,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8114,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.3831397215480176,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8037,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.40656339965363736,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8041,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4230254535846176,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8037,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4502358942022541,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7649,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4133916093154814,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7642,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3996960998479458,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8166,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.44549862199892615,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8283,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.396495724613648,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8008,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.39841238324541506,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7687,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.39333262230313937,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7383,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.40407206038335314,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.774,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.44971391392611154,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7585,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4461302770699972,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.797,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.37971869846791584,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7014,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.41317053030930717,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7832,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4263929203142742,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7407,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.39562339260632456,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7781,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5013959628903849,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.9173,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.3942335902989073,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7969,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.37459670529215294,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7294,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.4210059470185516,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8715,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.41642305626617326,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7845,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.39990830100754915,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7446,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4303864965308377,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7968,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.39820698504610136,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7969,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.42663518600017114,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7582,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.41520429902798184,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.8021,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.36322319811758375,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.6502,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4163307297397158,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.8172,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.37562605218750017,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7547,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.4230534393680675,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.8363,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4276005604829845,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8644,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.3939117069959653,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7293,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3796784177603523,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6843,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.38621167634662484,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.8034,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.44536606693319236,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.8331,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.3843524680432091,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.6972,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3776309616691966,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7132,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.424706653591385,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7772,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4362257286558965,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.8142,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4144045790572788,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7776,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4182797718014241,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.8402,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4042797647645218,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7901,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.39190545447558184,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7468,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4152515011993666,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.8122,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.44268429313744734,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8285,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.3425723489745504,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7116,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.41744878887910053,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7176,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.46331206241115097,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.8512,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3613265930834721,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7438,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.38698802573239155,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7379,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.386993427342769,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7188,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.446411015740921,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8473,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4024015048695294,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7998,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.44200562601104565,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.8643,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.44979198811554505,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.8779,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.3683046668010399,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7407,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3805387133659554,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.735,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.4335537863789879,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.8147,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4515454431913857,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.9138,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.4071599274675956,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7195,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4448620554088051,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8262,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.40612385326640654,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7293,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4201371009731904,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7699,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.3817394982005736,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7096,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3843192659592516,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6966,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.354570749367614,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7078,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4398836894186461,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8125,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.4030011170075466,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7938,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.43157156210319664,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7683,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.5041538238832468,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.8766,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3757364033441835,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7205,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.42081440661009456,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7702,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3894381312900156,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7194,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.41722293075106853,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7095,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3775195605592594,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7463,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.3781725749303525,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7386,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3706384133547582,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.718,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.42984820535863394,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7752,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4350068963361006,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7418,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.40270988988661344,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7238,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.44146982128082984,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.8166,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4160425881027499,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.8187,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3793738450125677,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7636,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.39380344780082505,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.771,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3999588927094165,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7631,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.35011976155280244,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7405,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3691023639048862,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7313,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4276610856998654,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.789,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4015299875013705,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7685,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.43478748123059957,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7292,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3647866649374095,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.6115,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3706666387908333,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7263,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4082112506331456,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7279,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.3567764932832936,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7078,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.32385613327994045,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6632,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.41752017211552456,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7704,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.36484783789209513,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7171,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.3800517948110794,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6994,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.38449086574583846,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7564,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.4375588496174668,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7946,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4170392613463579,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7976,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4013437633464495,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7602,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.40586730733844506,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7139,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.38522148446881194,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.706,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.42134563172884165,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7899,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.46908264078789513,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7233,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3456513199532403,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7475,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.42382505862487274,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7127,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3848614543819859,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7472,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3754873003535694,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7421,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.35672849974354826,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6879,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.393706100119924,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7442,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5132659461901151,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.8223,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.42096085231862373,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7911,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.41160139009461555,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7602,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.40370372309885755,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7522,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3797479758946454,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7441,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4089049239884153,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7674,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3699422967708885,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7913,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4172489783980525,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7504,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3941674909826128,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.7512,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.3717788781996277,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7018,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.36612187327758705,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7612,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.4240098441909349,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7752,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.4052278391226007,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7494,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.38791050098976654,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7352,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4193408021718286,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.8262,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3436289543752552,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6915,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.39333500810550576,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7322,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.3879541659537608,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7735,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3992620787556518,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7088,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.45595291894570794,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.8255,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.39090092696923756,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7784,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.39554473367621173,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7205,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.37634005674867854,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7633,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3923165222100373,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7269,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3802239727476873,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.724,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3887117802908206,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.779,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.38929208447851044,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7598,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4392462816587192,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.8084,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4233610027255395,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.8081,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.35551803094636486,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6632,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.43762388215444115,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.8377,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.39059086643467694,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.717,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.48759849975439196,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7588,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.36638834844002566,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7019,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.40404816221652523,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7781,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4178781138104464,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.754,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4067391473152315,
+      "learning_rate": 0.0001,
+      "loss": 0.8082,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.3527104968566196,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6608,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3968723263442942,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.719,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.39597297154816335,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7231,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3768104530544193,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6931,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.49124318766056924,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.8417,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4008540189956664,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7466,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.38033117101983577,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.6648,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.42433433907030377,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6843,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.39575325823787383,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7249,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4350701258943475,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7909,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3969116712819791,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.8239,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3207117103886632,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6272,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4350506568108911,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7648,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3727988120773944,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7107,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.3684460888235613,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6512,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3782634227770135,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6954,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.41514391282200863,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7651,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.401028442423211,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6985,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.3480899665203147,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.695,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.41568335772628967,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7377,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.39347165513947524,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7183,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.41172276913242867,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7699,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3618416233059816,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6934,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.411553987932596,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7428,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.3698943895704956,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7472,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.43346183874671734,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7254,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.3535648717952184,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6434,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.38838719596815624,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7129,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3424647163807555,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7146,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.442661686489916,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7515,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.42019776337895187,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.8075,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4379255306731001,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7847,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.42893827182216726,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7439,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3849300991614672,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7349,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.39934497863521856,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7044,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3910138678129991,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7603,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.37063618096573925,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.708,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3797548351310327,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7516,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.367283331582471,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.7282,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3693911710963683,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.742,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.37117296819053347,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7307,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.4178706979946997,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.8086,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3341662767235502,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6586,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.39130683157286156,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.722,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.37536743917135806,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.702,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.3894262722031057,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7387,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.41955287473679065,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7357,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4312111630989293,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.8228,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.39716106325643424,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.8019,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4178377909508333,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7513,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.44004532782831524,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7758,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4076132922530294,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7571,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4384143849615539,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7597,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3824858096412295,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7163,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.41274801929946386,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.8296,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3778452359098377,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7169,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.83457178394802,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7542,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4051074171497204,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6941,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.40839742549491664,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7335,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3599698010426839,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6727,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4044892607723987,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7579,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4136646261049131,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7691,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4023757957141722,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7302,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3745309167149619,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.721,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.4609841289633602,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7652,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.43918955408635996,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7718,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.38943471053372797,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7419,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3641328317006478,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.6782,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.39054082564457465,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6946,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.4163721453017164,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7334,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.4040283264722037,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.746,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3899654799773014,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7533,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.3862854476843213,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6908,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.41090067469848174,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.8084,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3903151474186511,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6616,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3408539252470795,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6419,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4301941156068001,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6987,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.41468517833589413,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.8409,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.3641103522991886,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7095,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.37981059039886306,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7086,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.361243057344294,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6633,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.388195254180337,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.7363,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.3905619249093549,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6728,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3932491367342553,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6946,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.350177793935555,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.69,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3618476324694659,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.695,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.36566105958982537,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6729,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.37240137101971765,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7005,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.365534114600444,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6521,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4278148742820072,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.752,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.3446300250209333,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6819,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4564423647918072,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7751,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.3482488541574724,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6676,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.429744562251442,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7065,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3555226566302722,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7224,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.40841608187723355,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.714,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.38425461037068026,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.7388,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3250019884787973,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6108,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.36666384292032755,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7037,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.34705434799655177,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6675,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.39998728379268955,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7274,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.36905428382946914,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7448,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3369505694390931,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6504,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.38658903930111854,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6545,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.38341682365351915,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7368,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.37245313752497233,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7203,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.3925999422669356,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.744,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4091867154975362,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7168,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.43459939651537244,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7363,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.37977781506193403,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7212,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.42916604552824417,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.7584,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3811683876560328,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6965,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.36854117902095374,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7465,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3486796533755056,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6606,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4223137021437662,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7942,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.41888004844002535,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7344,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.35508306503610504,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6782,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3660112193162608,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7241,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.4036480896953565,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7688,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.407694512647162,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7412,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.43662370557404956,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6908,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3634891497837515,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7257,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.40664977533567964,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.7068,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.39365246871243736,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7766,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.4170646598756153,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7237,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3949223629278131,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.72,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.4066949646787602,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7605,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3923750393538408,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7038,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.40529782786728324,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.7033,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4299209870019213,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.747,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.41000023145491954,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7578,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4427874287197096,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7156,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4056149971087807,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.7204,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4735201037988204,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7071,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.390191805503046,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7422,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.42161085510406815,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6752,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.3634090280667586,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6786,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4175867578453434,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6741,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.38021645890769495,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6667,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3629416835719003,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7011,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.37758032419229504,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6847,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.37926863810244416,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6843,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.40483928275961767,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7684,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4012428830040729,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.7423,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.4531335988464249,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7868,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3724295350867727,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7229,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.31844459673385905,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6571,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3817107137939138,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.658,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.41823823403293325,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7522,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3853953899148513,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7086,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.37444613159378504,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6852,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3364469465738454,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6309,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3766916878108581,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6876,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3687270478967296,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.631,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.43006711836241546,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.7249,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.41376371976415266,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.7457,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.39661335961253313,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7553,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.41278923081094665,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.7389,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.40790022202783777,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.7601,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.37338816286727505,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6919,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.444446835884112,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6802,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.43917108463930477,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7169,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3782458515975163,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7329,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.44412516224453413,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7783,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3706509169863611,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6854,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3857956584271855,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6859,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4003236290741562,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7439,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3760773928985438,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6723,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.36045457976400214,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6893,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 1.2048342969352837,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6331,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.3674614487857438,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6861,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4309117087357981,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7497,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3746159140343749,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6925,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.49526397518811116,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.8221,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3552528891639267,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6632,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3850478844126527,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7187,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.45331788252803906,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.7863,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.41113094587203364,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.7442,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.34173299346522756,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.621,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3918093267833597,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7471,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4071757676646653,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.6827,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.36461233978685836,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.7157,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4173732799907159,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.721,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.38034957823911675,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7238,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.36901819783380657,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.7073,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.34235739707844226,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6765,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.359160494256085,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6573,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.36841471957322675,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6714,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.3746998392794317,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7187,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.4009926064824942,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.7293,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.39747495239080854,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7569,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3603387458815299,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6585,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.35799781284175525,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6808,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.44114057001829704,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7489,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.36499192034738037,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6568,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.40822150511019223,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7737,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.419435383777233,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7003,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3673923491290724,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.688,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.4014301027660096,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.693,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.36667018873927276,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.679,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.3792293799271566,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.7182,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.34501044137013914,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6377,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3958280906980653,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.666,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.33412031700283,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6487,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.3801984010243941,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.731,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.39145676353884284,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.7134,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.4504736694689959,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.7915,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.38525345734357347,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.7145,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.32936707004333354,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6181,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.35780029503587424,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6665,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.36326556419103656,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6328,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.41195049470576833,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7343,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.35621015351537844,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6396,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.4792182923930832,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6594,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.6149215583954762,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.688,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4297627637067828,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7642,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.40345632075139887,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.7588,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3610496830175501,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.7162,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.41011685694858785,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7967,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3798344941908174,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6582,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3464177728333097,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6785,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4312418663664988,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6904,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.3948205989738892,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7135,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.43551744339062237,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.8037,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.42973270177790496,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7149,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3931138997667529,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.7311,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.4069832256194925,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.7481,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.35295605615077075,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6563,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.36970147203136483,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6784,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3993687277137508,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.7054,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.3893239804460525,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6396,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3645677559830506,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6305,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.37258441212125676,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6641,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.406222026526566,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.7173,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.35254682706810464,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6441,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.34578045575046695,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6401,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.40371305535068225,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.7299,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.6228207476057178,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.7163,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.3714632656937209,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6307,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3977346348790776,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.7076,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.3784668881873,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6746,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.37457939265841944,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.717,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.34966436288115627,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6452,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3901389579345433,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.7063,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3690124987745926,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.7087,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3367381615082695,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.614,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.3846644802828723,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6728,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4241241328617195,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.7396,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.39826283281007263,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7462,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.35359225029925234,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.7052,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.40396954109044586,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.7128,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.346954730447735,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6635,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.38781032367239804,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.707,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.42152169571432246,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7352,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.38507703413309846,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6512,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.47196913997467754,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7447,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3575724745654256,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6611,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3995944350589313,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7024,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4241968511826328,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.7842,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.37744754885360143,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.7176,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3695777839879396,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.689,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5692474674640828,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7333,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.40111766458101306,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7541,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.401526406854631,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7282,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.34518584418627035,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6562,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.38119101383299064,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.7488,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3950947394304229,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6646,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.37298648609688384,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.7561,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.38545179724312784,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6542,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.4213751776668678,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.7147,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.4113797418789869,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7601,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.38536088528820606,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7049,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.3940209686407352,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.7093,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.38690608167699586,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.7008,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.3637323418643273,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6682,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3613533381200542,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6902,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.3796143995634272,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.7431,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3738193351099153,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6407,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.38988250774522487,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6804,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.4016807389934873,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6784,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.48043543790293974,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.8607,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.38827723015107324,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.778,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.34769342382578017,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6027,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4443948802872273,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7918,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.38797962276906184,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.7203,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3728930396134099,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.7108,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.45712577327358017,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.761,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3787903003015055,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6748,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3828030831780167,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6948,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4481459645525971,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.7452,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.42796719785108117,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.7581,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.35796128731751464,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6817,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4597394998169397,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.7275,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.4387161435546444,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.767,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.6230864849651305,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7307,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.368425023152609,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.643,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.4251114006197662,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.7252,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.38524312435789754,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6405,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.38587067921959417,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6866,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.5329911655371361,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.7437,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.36957638046494906,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7233,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3529599745664591,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6753,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3222777567675539,
+      "learning_rate": 0.0,
+      "loss": 0.6019,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 564799309512704.0,
+      "train_loss": 0.7624192319869995,
+      "train_runtime": 9812.0821,
+      "train_samples_per_second": 1.019,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 564799309512704.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8bf098c7db00ad5f90efb5ddac146a4d2ea3dd9f
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cfa7133f35b40b9c56fcdf92119575cade1bbd2e
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56d4a1fc562e1df7086b8402a70b528b9238517c664afc92a8542c141223eb31
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ba4c0c90b5fb5065293dbc49c95107dc7f539abe
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1a420a0c6568db3d9b9e19aceca86a024806831d6ffa900fbf7c2a6243dd8ac
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d3ce7be30f414db6f6a5d953fbc46404eaff38e8
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.8257624953110868,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.27,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8849018671370342,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4458,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.7509311592129871,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.2182,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7137680066316227,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.2617,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7316186299029664,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.3126,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6001527072491968,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.0612,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.8492302070359525,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.0722,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.450302548161873,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.0387,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.6960657836646971,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9636,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.705172266920731,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9926,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6120426006524602,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9623,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5426022905466087,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9781,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.47821872255356507,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9036,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5498607304344992,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.9288,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.4813152944280897,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8484,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.4279786143269863,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8211,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5233559183729891,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8525,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5037619024731214,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8096,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.47578877531470143,
+      "learning_rate": 0.0002,
+      "loss": 0.8462,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4972768484731378,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8986,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.4899022691665234,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8565,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5182013476634282,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8519,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.4885670832653695,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8396,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5051081412652413,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8486,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.45830988104135423,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.9069,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4819368762206989,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9282,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.46110598549601506,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.9237,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.454165240480369,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8468,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.4710043873377542,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8353,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.42572029425373187,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8304,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.40729676507467505,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.7821,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5461850316857276,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.9009,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.5092329216983018,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.872,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4407306898878146,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.815,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.47477756275372385,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.9128,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.637199937427722,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8611,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.44257844598618723,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8562,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3839093232039565,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7666,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.38982421254229743,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7774,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4672412257535968,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.87,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.49388594602208363,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.9593,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.40004466148897183,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7857,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4391729474981486,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.9029,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4062100678606932,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8269,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.4884470051347818,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.9164,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4487857448368005,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7965,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.39765220657903605,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7784,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4601633869157618,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.9308,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4101666126792353,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.754,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.44548198621900076,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.856,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.5801328126105747,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8932,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.36856601944646905,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7528,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.45425100785181777,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8669,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.49515129914524086,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8666,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.449627157260385,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8773,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4519274467445987,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8933,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5315561619090179,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.966,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.41488901833456876,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7854,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4209767323467047,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.754,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4646050150852957,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8864,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.40622135118720293,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.7717,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4105597365968364,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8562,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.3976819335123263,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8209,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3923890516912274,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7461,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.49781743100843345,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.908,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.3664119530957589,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.748,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.3868815767533198,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7793,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4637771670849159,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7995,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5112540673791847,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8757,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5215635020199666,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8088,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4175422409943572,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7561,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4199343991060303,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.818,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.4286859957616902,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8261,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.44104864828984747,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8134,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4898049596218389,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8684,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.37882329742187837,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7615,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4342522279372195,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8552,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.39918003097612353,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8081,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.3843608742830849,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7525,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.41185360675496846,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7833,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4712143549738129,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8979,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.46496616702516796,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8028,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.39756632366171013,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7294,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.48898786552510476,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8183,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4650291567120808,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8263,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5712177965190297,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.9212,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.46084933887694657,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.854,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4287071742279292,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7442,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.44196836754754315,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.8025,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4323605793101959,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8067,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4435088889631693,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8735,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4145358036680274,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7529,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.4347155628860444,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8356,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.44837133567887316,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7801,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.37201668165668317,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7497,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5081356279683785,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8867,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.3938546028533844,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7808,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4500008885494135,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8128,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4495024223333861,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.81,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4406379829690688,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7955,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.41826780433074173,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8126,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.420674633705996,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8429,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.44375081790379906,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.8415,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4156010025808981,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8246,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4163840175778524,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8027,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.42829205970203965,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8678,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.5169822667669132,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8952,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4115600454299799,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8171,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.5459970450426384,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.9308,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.46653161274206084,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8286,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4169817851447276,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.785,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4296986336280778,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7317,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.4598289764406156,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8509,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4284789450172835,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8109,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.4085105133998958,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7483,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4223516139021159,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8008,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.4629910355273375,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8232,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.422795987366406,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7653,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4366821075522493,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7955,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.49327921484191617,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8061,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4130638414034734,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.777,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4021659071975698,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7052,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.4271748007291228,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7365,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.5299687239505216,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8153,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4417088026157466,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7942,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.45394229276773096,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8192,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.412362902033608,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7597,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4007337048627665,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7681,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4615639947139987,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8859,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4302940521491683,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7851,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.4627462262458623,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8236,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5225423155947857,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8811,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.4118106570776621,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.75,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.40334530582040934,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7871,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.481257405404212,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7831,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.395477348102348,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7759,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.47736217256570684,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8457,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.41422381746618936,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.718,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4377367205851379,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8147,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.43847669953968815,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8322,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.45068408744912125,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7826,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.42442234856525457,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7558,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.42801746913566524,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7609,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4156507051576132,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7683,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4473201587405533,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7999,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4450985377903021,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.8379,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.47962557581652776,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8857,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4545841624688034,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.8338,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4872334190206932,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7465,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4115959905031825,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7727,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.42134033238754837,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.8107,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4651537446633174,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7605,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.3958512961688566,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7908,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.40374748010966977,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.8042,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.46632059523037944,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8747,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4089827861507361,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7378,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.4204564786385584,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7679,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.43135946170039424,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7364,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4572562720487233,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7828,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3964215078366468,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7606,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4079025985071964,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7709,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.41629485307444286,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.804,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.48862714334532076,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8566,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.5481987062788519,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.8234,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.42388491477007645,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.8151,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4278050985828012,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.8268,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4896370556326024,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.9108,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.47988687499860594,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8804,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.39563609410528994,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7929,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4137808481888729,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.8035,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.44737224794448993,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.8587,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4320781272251299,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8008,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.5549253939838589,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7384,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.40368626607297975,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7752,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.3919632231202716,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.758,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.38846970875015685,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7407,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4234695021547494,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.8041,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.37033350020258665,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7239,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.450640661964644,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8635,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.414006871374076,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8208,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.385575077556316,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7393,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.37700313300165783,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7203,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.36571802827600036,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7634,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.43104127310561624,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.8244,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4272358408847561,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.8464,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4224505342065028,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7587,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.44320846402142194,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7414,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.40422936071607923,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7393,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.38470066057426316,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7608,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.42680134499154426,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8225,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.39199904277864656,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7313,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.48430667283851375,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7957,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.5008213998300665,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8366,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4258478451411289,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.8014,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.41286270326182095,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7777,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4608928926555317,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7625,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.37566052898735275,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7269,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.40093337371128995,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7389,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.4250575628607831,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7801,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.38638385236682093,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7088,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4490332176805291,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.8242,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.396049438484832,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.6977,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.44389641370159033,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.8626,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4138122349178688,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.784,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.39957142803660367,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7464,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.39044490610642507,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7224,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.423854098617967,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7765,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.494937095647527,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.931,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.43593555821026797,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7474,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4180729166268637,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7606,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.37792494101350965,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7049,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4258499126283775,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7707,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.5362628797091166,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7658,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.423541897868339,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7839,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4400929463964708,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.8009,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.39823372133247276,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7407,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4012724681493264,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.8122,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.430027401544907,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8377,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.3846442643320667,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7156,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.41989272346707535,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7607,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.492464913681311,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.7787,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.420887246174931,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7184,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3921935792428722,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.6461,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.43121996065043783,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.8441,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5256407145540728,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.8053,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.39102355519678256,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7677,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4450017516918292,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.8406,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.43401607108771595,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7895,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.43172550282134836,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7656,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.40115710551859224,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.6914,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.47508107157931734,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.8685,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4364658292422192,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.8461,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.405357155813672,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7901,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4268868031697948,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.802,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4203661553305176,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.8016,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.44818778300154055,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.8305,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.40703274262091965,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7762,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.38366165022774773,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7233,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.39598038159608206,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7415,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.3945885969531272,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7058,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.43623370184657606,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7978,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4204091213025159,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.8017,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.5289529913397587,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7869,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.39563063038232776,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7279,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.6646447717718736,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7663,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3436166962308719,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.6877,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.41497417485524535,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7728,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3940763188554032,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7276,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.40170852358208364,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7649,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3450250773425443,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.6869,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.45045967476777526,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7924,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4170294253003158,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7832,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.3984311220870121,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7684,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.40637574390925585,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7922,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4092462090318203,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7896,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4167534061507475,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.8078,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3924934029683997,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7332,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.39804257535948756,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7392,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.39437321700400224,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7295,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4130817180305685,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7655,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.46532009398453045,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7732,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4421018210383502,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7174,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4325475151037246,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.8001,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3793179799376978,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.6803,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.39957002001596764,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7373,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.49396432276637636,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7904,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.3966663161210732,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7132,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.33985759240020935,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6269,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.463495162503486,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.8521,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3864811707219409,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7731,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.35898489171148906,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7166,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3830649830003527,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7008,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.39218740129801105,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.8066,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4545210140307896,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.8088,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.4001646467869731,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7583,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.501760698783621,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.8184,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.41495984936402913,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7502,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4429571974157355,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7889,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.35297438425001587,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7223,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3853526802627918,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7349,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3959742690518756,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7553,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.36840329577987396,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7237,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.4147780404717559,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7562,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3225832480362139,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6579,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4295353448094699,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7615,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3775483048845946,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6848,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4142142075691693,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.7828,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5353698126363954,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.8546,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.3709076345312252,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7884,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.38094439266936675,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.8052,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.3648239047897685,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.6778,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.37621723942302054,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6868,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.37294335429933,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.6813,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3947357539347734,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.776,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.42250703638121173,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7903,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3653307401093366,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7897,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.43225090802331745,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7897,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.35987063668403163,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7095,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.3703943344110747,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7044,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.39396779805761384,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7138,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3669713436401558,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6887,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4115968677666093,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7336,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.4053731358187222,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7511,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.383167190850852,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7879,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.44544583953871886,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7624,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.38762096582354316,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7064,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.46154447729862186,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7773,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3667986654815652,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7164,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.39421575859433117,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7849,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.39348479958744115,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7057,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3345782623398654,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6458,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.38077963460395853,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7538,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.39380524756070634,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7337,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4027945367600016,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7876,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.36455482252741356,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7172,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3935296430183471,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7782,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4290128530986274,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7033,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.43106936236417626,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.8015,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3895271346888774,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7489,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4430065252654003,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.8307,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4128390845399857,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7193,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.49443405778362715,
+      "learning_rate": 0.0001,
+      "loss": 0.8562,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.3593474485537418,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7087,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.40412028320168725,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.753,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3951795079899187,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7211,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3367446433383703,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6938,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.4841404000315512,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.8777,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3851192893210263,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7634,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.5704732313833488,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7353,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3806526251171886,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7154,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3846988731990761,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7406,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4010843355156052,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.8044,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.4117946143014912,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7466,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3524936105268302,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6818,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.38073387417606747,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.6732,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4046186313227552,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7236,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.36007367962423836,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6734,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.37557697379206506,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7367,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.44575227355425057,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7971,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3982433187654024,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7411,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.33802865927334874,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.6448,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5241892433461196,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7203,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.3795284951339069,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7399,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.37754983890899113,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7293,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3811119514810814,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7078,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4272118304347965,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7358,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.3527172570488226,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7124,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.41975296557179936,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.814,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.4366513397993346,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7753,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.36646782412326345,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7218,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.36411771714130664,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7333,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4171863450474228,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.7143,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.4589408752697243,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.779,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.50266819165461,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7659,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.38642188183332127,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7192,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.41730490029421946,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7533,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.39415503852998696,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7477,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3861078446656998,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7181,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.3831644260946984,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7182,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.39034010397086755,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7445,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.33999132167303014,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6693,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.37322174347140913,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7158,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.46321417604021015,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7518,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.42936758323938373,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.8485,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3753215628295166,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.7021,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.35094014140180074,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6345,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.431186522618548,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7225,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.37720378959640954,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6884,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3905684871067712,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.741,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5132491011476759,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.836,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.47951055792597375,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7884,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.43948102408392065,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.8311,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.48257264189097065,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.8758,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4447869671042225,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7378,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.39888307027522596,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7463,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.35485033544439026,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.709,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.435385292631864,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.856,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3654057961572283,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7277,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3512050443870654,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6231,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3631995693895185,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7405,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.38125792459638624,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7495,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.37284718216829515,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7397,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.41853111533941406,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.8257,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.42212708864466325,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.7621,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4149609259978424,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6828,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3591701977975191,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.6771,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.4462034755358388,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.784,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.4389177547070431,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.8281,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.39498352550620824,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7069,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3902459879971352,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7696,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.3841269188816985,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7026,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.38501668513554343,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7246,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.42015739538850205,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7774,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.36397079116853126,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6882,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.39594664298516524,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7239,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.39402405299278287,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.76,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3908592872704309,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7243,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3859405443289071,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7062,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4165749502624558,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7745,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.37482765096839027,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7442,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.3460701837878398,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.7042,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.38972027239062884,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7041,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.34049194096743063,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6731,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.36566354590299033,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6411,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.36042562161243635,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7209,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.397309512992513,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6715,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.40629926491617097,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7581,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.38356606375229335,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7244,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.4152544313073025,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7651,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3758133525999123,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7082,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.3816846505519979,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7805,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3965486514163321,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.717,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.33240408453613673,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6401,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4175149020063599,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7771,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.3732170038705519,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7214,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4419458585067435,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7804,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.41353646131130006,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.7175,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.40766063788271767,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7859,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.35690262967116304,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6778,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3882922737844192,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6904,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.3275958128104016,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6574,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3716514794615876,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6945,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.39215917011327645,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6868,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4176689926298356,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7971,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.35351206530878865,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.695,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3960789782385614,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7052,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.35111156694312495,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6548,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.39265391171644753,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7084,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.42802899420975543,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.7307,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.40932452653354834,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7581,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.3682883000354338,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7211,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.36143839488425517,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6962,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.407685444205195,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.7794,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3914127433539138,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7232,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.32812967570686413,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6575,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.36512818943137537,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.7275,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.39077515220878783,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7408,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3697446229432021,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7028,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.38193183440104617,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6518,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.41531624959307795,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.742,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.41139871977395037,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7581,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.38161584010920957,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7364,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3749220941999862,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7079,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3777814657377101,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7123,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.35130940006502914,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6171,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.4012010774522812,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7606,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3717235290661236,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7096,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4134247538055933,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7233,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.4055302874678319,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7306,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3944977507504851,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6487,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.34873206606474894,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.7345,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.39267676884304525,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7755,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4053108315648405,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.7137,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4023351316038286,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.8173,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4093808107576682,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.7122,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4491558034357714,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7879,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.3617404389519696,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7268,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.42799834376001655,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.8069,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.3510121740629392,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6819,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.45624590684608124,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.7434,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.3698458004495236,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6181,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.37472419134033885,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.7097,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3993404575907408,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7013,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3269754679799917,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6265,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.3800469959383693,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7103,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3695595873693324,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.7032,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.39376261966416126,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7354,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3801514420197968,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7268,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.35377429778077096,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.727,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3426628136280171,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.684,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.41349331661824823,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7034,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.42756822660149474,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.7443,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.38368861663048015,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6673,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4036097214403057,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7114,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.37126779143968575,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.725,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.42227580118022057,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.7536,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3589848733378133,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6327,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4447606307607397,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.8961,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.3618392218752661,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6671,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4017181337738947,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.7277,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.36709172680448393,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6982,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4075043074993376,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7011,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.44322286468878724,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.733,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4160851937190195,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7779,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.4049238728038501,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.743,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4638416235860613,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.8496,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.32242176649697163,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6046,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4056506755784108,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6943,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4306554451678464,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.7155,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.36470418661099374,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6124,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.3947940275793731,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6759,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4600333734797668,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.7176,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.3570800777269948,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.7315,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.38644016962962036,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6743,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.392617559246826,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.7205,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.45611966255994957,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.7692,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.37146254281751323,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6856,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.38229305346934395,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7541,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.440461568892407,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.7624,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4020066834371898,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.7859,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.3492975200839857,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6619,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.9341472418513593,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7237,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4025533441878727,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7634,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3639760736523722,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6507,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3745761817589749,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6868,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.37884889902728264,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7736,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.365674772684498,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.7182,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3624795912730274,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6225,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3683753071921724,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6936,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.35542279663291104,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6602,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.46197904230171544,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7309,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3792773991364782,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.7131,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.43029104336412755,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7544,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.37613372909705506,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.748,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.37723167866393553,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6579,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.5880112976429507,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6757,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.3642343661248838,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.7208,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4139194259023456,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.8235,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.37933257982824714,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7123,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3738416827071965,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6959,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.40217246805698825,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7336,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.35271916282499327,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6936,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.3646030235779607,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6937,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.34717235326994916,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6172,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3566662258185896,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.709,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3333023597264441,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6998,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.37598417466923056,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7131,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4544822203706908,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6943,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.4272499819402007,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.7532,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4231649747359272,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.7626,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.5451685171570724,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7421,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.38091879800214,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6688,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3372074968746882,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6387,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3820980681553104,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7031,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.5064060662251902,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6677,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.4379848712592458,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.799,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.36116529315085705,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.666,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3777588973708302,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.7148,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4167105277665021,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.7685,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.34278184923898475,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6508,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.3698009942965616,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6828,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.40029887541730624,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.7863,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3586155213422039,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6712,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.35753596513691044,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6914,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.37166902509689304,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.7057,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.38789242252868983,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.7205,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.3613517774832937,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6579,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.36005293153058765,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6636,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.42523047864521163,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6521,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3816463390281512,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6912,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.3771823012583285,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6688,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.41737079550125006,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.7161,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.3765121293994986,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6734,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3693903375454175,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6272,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.339695796508919,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6653,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3919350116426099,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.7107,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3270127503285997,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6282,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.33748229524615164,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.618,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.40175416378107937,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.741,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.35748115958853127,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6269,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.39710210789544903,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.7247,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3354133701651576,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6524,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.35240132550609804,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6588,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3954769971174592,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6755,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3494405366807235,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6475,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4137297947591601,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.7107,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3513366536788848,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6973,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.346812197226684,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.705,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.41841690235705675,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.7331,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4013822537523903,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.7293,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.385092970010389,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7544,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3273345381287588,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.638,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4025834689258895,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.7374,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.34894211940017017,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6825,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.39694560685304675,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7365,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4309222891676764,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7732,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.36303753967921265,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6455,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5312086623336856,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7509,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.36877958151863327,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6938,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.427257036979857,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7845,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.37609962331991664,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6788,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.39459259246211736,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.7242,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3960478994548615,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6841,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4472870271167729,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7984,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4952527410376186,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7035,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4732747771840069,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7197,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.3741639402451771,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7303,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.38366485382899274,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6993,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.38509965539294083,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6775,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3685855566624197,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6412,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.3717389723254707,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6387,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3865092354235318,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6676,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.35129856196138265,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6856,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.4692103617575446,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6423,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.3997876857378165,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.7305,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.36764496849984923,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.656,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.3693734170527303,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6596,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.34281269584844914,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6312,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 1.418254312024875,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.757,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.34961298014672315,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6155,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3537304281066106,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.5877,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3776264669356993,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.7001,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.44861620357104404,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.727,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.379647447685291,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.7139,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.3318674040023656,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.654,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.406704858957328,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7193,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.37829259854556246,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.7246,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.40938567202470044,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6912,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.38616679202871845,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.7129,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4166701096457814,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.7468,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.36679294774649107,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6919,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.43166086429980705,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.7879,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.36798697225067656,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6477,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.38266191831838003,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6782,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4358544822598908,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.7521,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.45600870802673316,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.8347,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.43877231268351735,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7373,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3427494084909085,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.703,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.4026245808095916,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.7031,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3811537659017375,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6928,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.41639384346031383,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7257,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.41962231191955746,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.7018,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.40149996237190355,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7354,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3661174120034241,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6987,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.33420017937981583,
+      "learning_rate": 0.0,
+      "loss": 0.6408,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 569932943065088.0,
+      "train_loss": 0.7621833514213562,
+      "train_runtime": 9858.9358,
+      "train_samples_per_second": 1.014,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 569932943065088.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6893779a25445c0494928b1ff9d10655c35b451
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "q_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d7eb20703d38d43b6e4984121caf4a6e0165c260
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b40c219d5d9b5bec2f4ac07da90e719dbadba67c3be46da50b1d5098d4991dd
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2d6d9e259b18437d58b62934611ed505f58dbd8a
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a52359cc6dba98205907cc3a1e177ede41b865bd872a8eab037bc751b153584
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e630da6d003cec2d4160d27bea00e853158b0a4
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,4417 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 625,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.7231983852482833,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.2218,
+      "step": 1
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.9239428360353562,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.4363,
+      "step": 2
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.7180615378365476,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.191,
+      "step": 3
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6817646092846863,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.2504,
+      "step": 4
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.6605178885709505,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.2184,
+      "step": 5
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6490829266228314,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.0808,
+      "step": 6
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.8602365436315741,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 1.0775,
+      "step": 7
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.538864928859595,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 1.0181,
+      "step": 8
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.7654371127001822,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9833,
+      "step": 9
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7138691243196663,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.019,
+      "step": 10
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5886278358027343,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.9399,
+      "step": 11
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5458193648893132,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9659,
+      "step": 12
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5177775238784846,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9405,
+      "step": 13
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.5251989265716136,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8936,
+      "step": 14
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5170180481311273,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9367,
+      "step": 15
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.4447650129806105,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8518,
+      "step": 16
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5130114045647743,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.921,
+      "step": 17
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.536096545149244,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8237,
+      "step": 18
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.4747624905240158,
+      "learning_rate": 0.0002,
+      "loss": 0.8315,
+      "step": 19
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5272007852308388,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8823,
+      "step": 20
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.47184598750765694,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8982,
+      "step": 21
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.4686585204755349,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8416,
+      "step": 22
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.5093496630970099,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8637,
+      "step": 23
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.44506071615956516,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8293,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.45305206064668757,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8356,
+      "step": 25
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5162905796803792,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.9218,
+      "step": 26
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.45194404186666465,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8877,
+      "step": 27
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.39989939508442185,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8065,
+      "step": 28
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.500024102912701,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9477,
+      "step": 29
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.44488308131175786,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8545,
+      "step": 30
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.43483669672508,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8122,
+      "step": 31
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.45846847202763763,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8754,
+      "step": 32
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4703483132913366,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8578,
+      "step": 33
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4635598888604102,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.7912,
+      "step": 34
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4824686288787691,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8814,
+      "step": 35
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.43694675557467566,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8162,
+      "step": 36
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.425351041096402,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7953,
+      "step": 37
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.38452492867769794,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7559,
+      "step": 38
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.3702718226298968,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.727,
+      "step": 39
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4313030058682959,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8624,
+      "step": 40
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.5519558150093004,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.9452,
+      "step": 41
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4893979262026225,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8985,
+      "step": 42
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4458711425408651,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8849,
+      "step": 43
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4683453183063352,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8542,
+      "step": 44
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5356719762148601,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 1.0009,
+      "step": 45
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4711332636236724,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.9286,
+      "step": 46
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4273826197745025,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8184,
+      "step": 47
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4977211939271013,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.8839,
+      "step": 48
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4411220100726296,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8103,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.442763721455099,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.84,
+      "step": 50
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.47643441433077544,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.9073,
+      "step": 51
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4617633420340866,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8034,
+      "step": 52
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.4855214192586931,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8373,
+      "step": 53
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4657300294379618,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8669,
+      "step": 54
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4383939014328478,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8137,
+      "step": 55
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4395112496004527,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.9085,
+      "step": 56
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.5304044430296124,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 1.0164,
+      "step": 57
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.3614820299027725,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.6757,
+      "step": 58
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.45168774647002613,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8429,
+      "step": 59
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.480628654761509,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.9253,
+      "step": 60
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.44062603086934643,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8753,
+      "step": 61
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4420390958086642,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8654,
+      "step": 62
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.3936872645650932,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8097,
+      "step": 63
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4053073352577127,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8026,
+      "step": 64
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.47558333628895877,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8552,
+      "step": 65
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.41322518957290216,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8326,
+      "step": 66
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.40526727949551145,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7565,
+      "step": 67
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.43984934584398244,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7443,
+      "step": 68
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.5633570704560397,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8122,
+      "step": 69
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4550062994140467,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8266,
+      "step": 70
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.44306577214515325,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8471,
+      "step": 71
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.460502515247839,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8474,
+      "step": 72
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.7040213764246828,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8557,
+      "step": 73
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.49032988752370005,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8772,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4347823759443212,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7704,
+      "step": 75
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4233544696283672,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7436,
+      "step": 76
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.47963772898838225,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7285,
+      "step": 77
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4087207360263281,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8843,
+      "step": 78
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.4044541070129,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7651,
+      "step": 79
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.39898291558221216,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8058,
+      "step": 80
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4325162026240143,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8343,
+      "step": 81
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.44678270740796794,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.8671,
+      "step": 82
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.45888051318224715,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7594,
+      "step": 83
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.44831895893069373,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8176,
+      "step": 84
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.41984502622281566,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8446,
+      "step": 85
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.41044107463774593,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8128,
+      "step": 86
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.44821025270461734,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.8759,
+      "step": 87
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5167747133762765,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8792,
+      "step": 88
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4297095292559342,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7773,
+      "step": 89
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.45276162661542235,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7908,
+      "step": 90
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.41541240217356085,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.8016,
+      "step": 91
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3622747630689792,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7264,
+      "step": 92
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.40031894800756795,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7834,
+      "step": 93
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.48082501114482634,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8875,
+      "step": 94
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.35600145331928734,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7301,
+      "step": 95
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.45930464641779406,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8515,
+      "step": 96
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.41963271587690376,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8478,
+      "step": 97
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.4500207122798498,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8808,
+      "step": 98
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.35756275233876716,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7014,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.42620631691726657,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8493,
+      "step": 100
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4229131471825618,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8046,
+      "step": 101
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4475019357926235,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.9009,
+      "step": 102
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.4063370550029476,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.8276,
+      "step": 103
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.48025442429594223,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8528,
+      "step": 104
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.41513887163492563,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8411,
+      "step": 105
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.39881679931357344,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8223,
+      "step": 106
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.46167228602985083,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.8821,
+      "step": 107
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4348299922553354,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8155,
+      "step": 108
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4606340496755724,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8852,
+      "step": 109
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4261363671055583,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.8074,
+      "step": 110
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.41441921857228664,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7537,
+      "step": 111
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.34966187240410096,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7092,
+      "step": 112
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.44880769270300125,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.868,
+      "step": 113
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4300532166432563,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8659,
+      "step": 114
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.41459091492252037,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7918,
+      "step": 115
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3936699595042386,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8224,
+      "step": 116
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.4299016960501296,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8132,
+      "step": 117
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3859838782184397,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.742,
+      "step": 118
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4176535375356266,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.8144,
+      "step": 119
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.42445455897420875,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7717,
+      "step": 120
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.406951765218565,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7898,
+      "step": 121
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4129670966436122,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7895,
+      "step": 122
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.3990480424499018,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7479,
+      "step": 123
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.37947353309371373,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7335,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.43753429667586186,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7925,
+      "step": 125
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.40563377239425713,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8314,
+      "step": 126
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.34805822464650815,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.6837,
+      "step": 127
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.40339949273873393,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7985,
+      "step": 128
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.45478075104512755,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8746,
+      "step": 129
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4020475250433333,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8274,
+      "step": 130
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.39769891080606107,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7536,
+      "step": 131
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.49319202306759663,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.9169,
+      "step": 132
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.3482894565665991,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7097,
+      "step": 133
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.398915525336025,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7959,
+      "step": 134
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.3925875461027216,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.8095,
+      "step": 135
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3698735981233041,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7447,
+      "step": 136
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.45067845618559665,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8078,
+      "step": 137
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.41091235860575553,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7702,
+      "step": 138
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4440504220821974,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8517,
+      "step": 139
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4538847821745842,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.8555,
+      "step": 140
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.4614655216354403,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8675,
+      "step": 141
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3807134184026545,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7813,
+      "step": 142
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.485645440042214,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8972,
+      "step": 143
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3866526459560267,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7519,
+      "step": 144
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4002641639066821,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7843,
+      "step": 145
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4245784518252565,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7682,
+      "step": 146
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.49493842111461256,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.8838,
+      "step": 147
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.42639065046813407,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7984,
+      "step": 148
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.44101418425470884,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.8171,
+      "step": 149
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.38863125402033316,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7071,
+      "step": 150
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.4402300611993132,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.9214,
+      "step": 151
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.3958990999259298,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7679,
+      "step": 152
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.3922774962026427,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7904,
+      "step": 153
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.38548872401946643,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7677,
+      "step": 154
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4584703334288434,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8509,
+      "step": 155
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.37807672222174266,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7467,
+      "step": 156
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.3896875121644614,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.8045,
+      "step": 157
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.422446706378271,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7931,
+      "step": 158
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4345092830405177,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.802,
+      "step": 159
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4109805837514671,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7445,
+      "step": 160
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4456762300372841,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.8024,
+      "step": 161
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5806850234023688,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.777,
+      "step": 162
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4651593782206959,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.9195,
+      "step": 163
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4494929684904094,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.8541,
+      "step": 164
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.40354223512618753,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.828,
+      "step": 165
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4319419814036804,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.8353,
+      "step": 166
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4675999844652303,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8615,
+      "step": 167
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.43362774505676915,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7625,
+      "step": 168
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.43816533532445734,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.774,
+      "step": 169
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.43426690755805253,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.8425,
+      "step": 170
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.42823798037894056,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7345,
+      "step": 171
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4063338463286235,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7897,
+      "step": 172
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.39696713946999873,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7894,
+      "step": 173
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.40317349721299145,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.8119,
+      "step": 174
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.399453657418752,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.8214,
+      "step": 175
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4265183819901487,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7938,
+      "step": 176
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4403751664017105,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.8429,
+      "step": 177
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.39862194879594887,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7645,
+      "step": 178
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4716213034948484,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8993,
+      "step": 179
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.40424827442610956,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7667,
+      "step": 180
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.39321379967009523,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7626,
+      "step": 181
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4324463899318374,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7403,
+      "step": 182
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.43587080051042243,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.8138,
+      "step": 183
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4351378421332776,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.8495,
+      "step": 184
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.42382258870089384,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.76,
+      "step": 185
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3735845437624687,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7217,
+      "step": 186
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.3746451882880628,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8167,
+      "step": 187
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3757852868423534,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7396,
+      "step": 188
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4193272800347855,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7896,
+      "step": 189
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4247677112289791,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8306,
+      "step": 190
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.43674433809018215,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.8306,
+      "step": 191
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3785698696505638,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7203,
+      "step": 192
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.48021686661530827,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8875,
+      "step": 193
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3885208439653135,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7445,
+      "step": 194
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.3891959958675227,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7809,
+      "step": 195
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4056890355515356,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.6909,
+      "step": 196
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.39978656560334713,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7134,
+      "step": 197
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.44775014777529165,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7949,
+      "step": 198
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.46304894978869354,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.855,
+      "step": 199
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.381657877086454,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7286,
+      "step": 200
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4196539341868882,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.83,
+      "step": 201
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.38090185944636157,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.761,
+      "step": 202
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.45605409999909596,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.8152,
+      "step": 203
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4032099589518367,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7734,
+      "step": 204
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.3893841740881215,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7019,
+      "step": 205
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4012209486177639,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.8093,
+      "step": 206
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3902971475144759,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7713,
+      "step": 207
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.402787069157667,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7885,
+      "step": 208
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.409791028207148,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7777,
+      "step": 209
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.44206483993103474,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7605,
+      "step": 210
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.5261416291348644,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7468,
+      "step": 211
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.41560987889298096,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7397,
+      "step": 212
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.39646930323212953,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7449,
+      "step": 213
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4193611673285004,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.8182,
+      "step": 214
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.42154430598014214,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.8182,
+      "step": 215
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.38976448945002257,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.8131,
+      "step": 216
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.41060492677429766,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.8114,
+      "step": 217
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.39989288556293207,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7655,
+      "step": 218
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.4096185391840245,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7672,
+      "step": 219
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3930935461396084,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7591,
+      "step": 220
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.46565303187633594,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.9019,
+      "step": 221
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.39310571443586123,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7677,
+      "step": 222
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3904256737304809,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.779,
+      "step": 223
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.378364991725375,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.8013,
+      "step": 224
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4096030420043912,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7956,
+      "step": 225
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4007692023729636,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7729,
+      "step": 226
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.43339974493101274,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.8337,
+      "step": 227
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4530321071927405,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.8465,
+      "step": 228
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.4278293397766571,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7763,
+      "step": 229
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.41652180547380624,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7509,
+      "step": 230
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.4731251149617755,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.8658,
+      "step": 231
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4446056116287485,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7714,
+      "step": 232
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.3953730552417552,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7186,
+      "step": 233
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4518617483617707,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8157,
+      "step": 234
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.42051458183639395,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.797,
+      "step": 235
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4282040460541697,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7773,
+      "step": 236
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.3981659644642087,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7925,
+      "step": 237
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.38031682467988365,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7004,
+      "step": 238
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.378734742002174,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7099,
+      "step": 239
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.40566754079083356,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7416,
+      "step": 240
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.4285863769040255,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7839,
+      "step": 241
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4486971547349298,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7981,
+      "step": 242
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.401294715482731,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.757,
+      "step": 243
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.429532769909926,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7652,
+      "step": 244
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4078703061524831,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7723,
+      "step": 245
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.38395637745009437,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7701,
+      "step": 246
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.46608786737309366,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7662,
+      "step": 247
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.35045160988081664,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7096,
+      "step": 248
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.37791039178233876,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7667,
+      "step": 249
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3701446044366339,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7505,
+      "step": 250
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.4072252118024341,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.762,
+      "step": 251
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5446744111289362,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7562,
+      "step": 252
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.39802460932591816,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7107,
+      "step": 253
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.41490940600074233,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.812,
+      "step": 254
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.382564640772057,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7406,
+      "step": 255
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.36688101890037983,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7235,
+      "step": 256
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.37758267877410473,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7257,
+      "step": 257
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4092480685877459,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7844,
+      "step": 258
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.4344992173838129,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7703,
+      "step": 259
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.40598239590824653,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7686,
+      "step": 260
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4034797257147384,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.8054,
+      "step": 261
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.47011639450673637,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.8405,
+      "step": 262
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.44187715596052,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.8516,
+      "step": 263
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3540415585880858,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.6861,
+      "step": 264
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3858217580819322,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7518,
+      "step": 265
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4014913505425734,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7743,
+      "step": 266
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.3483869054788003,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7343,
+      "step": 267
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3877059266312812,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7729,
+      "step": 268
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.4427999619091166,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.9003,
+      "step": 269
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3669647698336677,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7311,
+      "step": 270
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.37632648115591405,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7328,
+      "step": 271
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.38890322786941506,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7263,
+      "step": 272
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.3939325748769751,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7516,
+      "step": 273
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.45711681649704555,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7958,
+      "step": 274
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3906650106035462,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.7286,
+      "step": 275
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3890071506512066,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7403,
+      "step": 276
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.41111095214733645,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7467,
+      "step": 277
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.44937322110097283,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7609,
+      "step": 278
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.37919580505064715,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7705,
+      "step": 279
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.37064227528614685,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7247,
+      "step": 280
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3764518651002971,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7376,
+      "step": 281
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3535946848882062,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7168,
+      "step": 282
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3813621088573542,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7342,
+      "step": 283
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3269156299946925,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6725,
+      "step": 284
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3946878545722184,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7833,
+      "step": 285
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.41330638203636794,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7286,
+      "step": 286
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4361119728409083,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6967,
+      "step": 287
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4183377608817122,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7852,
+      "step": 288
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.3953212306940315,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7396,
+      "step": 289
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.40560920084607494,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.8026,
+      "step": 290
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.35981353270429806,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7193,
+      "step": 291
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.35709845495917836,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6829,
+      "step": 292
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.3539121121049415,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.6592,
+      "step": 293
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4014672698806347,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.8046,
+      "step": 294
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4011937126204357,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7273,
+      "step": 295
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3628078110971893,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7352,
+      "step": 296
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.40435074788644143,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.8142,
+      "step": 297
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.3992911802650674,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.746,
+      "step": 298
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.35279408055295663,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7018,
+      "step": 299
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.38884407535041665,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7411,
+      "step": 300
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3658038746256643,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.7154,
+      "step": 301
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3958013197872446,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.789,
+      "step": 302
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.38465550758524736,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7832,
+      "step": 303
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.38273579054099477,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7501,
+      "step": 304
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.4766554804629017,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.8083,
+      "step": 305
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.4507399064891884,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7602,
+      "step": 306
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.39948409245215666,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7124,
+      "step": 307
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.36978831021665176,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7425,
+      "step": 308
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3596715994910583,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.6993,
+      "step": 309
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 1.0129120421227351,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7103,
+      "step": 310
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.37404697027334083,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.67,
+      "step": 311
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4063613452265822,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7125,
+      "step": 312
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4311622529148588,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.805,
+      "step": 313
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4002862057086655,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7189,
+      "step": 314
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.35897131829240503,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7231,
+      "step": 315
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3840002215719583,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7237,
+      "step": 316
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3915382457016555,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7846,
+      "step": 317
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4264033461354839,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.8394,
+      "step": 318
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.39704423994334814,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7571,
+      "step": 319
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3937382052302273,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.6856,
+      "step": 320
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4367566088275942,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7608,
+      "step": 321
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.40495694712817476,
+      "learning_rate": 0.0001,
+      "loss": 0.7726,
+      "step": 322
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.38098392436937367,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7083,
+      "step": 323
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.38733430226751053,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7439,
+      "step": 324
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.44064750233715166,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7715,
+      "step": 325
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.35230373552238536,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6796,
+      "step": 326
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.4800720914552111,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.8364,
+      "step": 327
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3805193230557943,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7104,
+      "step": 328
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.5085183994179607,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.8648,
+      "step": 329
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.37929461103153767,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7171,
+      "step": 330
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3800242130516676,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7054,
+      "step": 331
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4336901328153918,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.783,
+      "step": 332
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3868825632199866,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.7272,
+      "step": 333
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.35573721294798893,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6274,
+      "step": 334
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3877579169958648,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.715,
+      "step": 335
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.45314018920459403,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7513,
+      "step": 336
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.38336563912064303,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7191,
+      "step": 337
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.400980502516487,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7804,
+      "step": 338
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.39925637895047644,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7971,
+      "step": 339
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3841983259664286,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.663,
+      "step": 340
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.36287443046027357,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7876,
+      "step": 341
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5084040575959375,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7187,
+      "step": 342
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.33765714798202046,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6758,
+      "step": 343
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4156605117410061,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7416,
+      "step": 344
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3873188462882457,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7115,
+      "step": 345
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.46084313761255724,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.7266,
+      "step": 346
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.3760503391250545,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7615,
+      "step": 347
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4141877976256147,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7584,
+      "step": 348
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.39944168824261084,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7102,
+      "step": 349
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.36133519899954275,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.7132,
+      "step": 350
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.3944336816628434,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7597,
+      "step": 351
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4129981634077143,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6973,
+      "step": 352
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.3980172989191709,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.7344,
+      "step": 353
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4187340354016876,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7642,
+      "step": 354
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.43473077350523287,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7718,
+      "step": 355
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.42056656040527896,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7257,
+      "step": 356
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.4183719668101969,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.701,
+      "step": 357
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3645065233849404,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7366,
+      "step": 358
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.4036044636169178,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7955,
+      "step": 359
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.37323141840597074,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6729,
+      "step": 360
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.36364003272999235,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6943,
+      "step": 361
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.372790405258328,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7192,
+      "step": 362
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.39078633590371337,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.7936,
+      "step": 363
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.40759819985709317,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.75,
+      "step": 364
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3533679152001031,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6894,
+      "step": 365
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4035765518667972,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.7355,
+      "step": 366
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.4178837302576139,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7373,
+      "step": 367
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.47558833242507675,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.763,
+      "step": 368
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.46500805545229223,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.8097,
+      "step": 369
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.46002421900473145,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.875,
+      "step": 370
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.40789734231352603,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7226,
+      "step": 371
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4113445916236749,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.7658,
+      "step": 372
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.4174838902603154,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7696,
+      "step": 373
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.41789225079157627,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7765,
+      "step": 374
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.39444238581947505,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.806,
+      "step": 375
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3596258677892891,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7327,
+      "step": 376
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.40451250926454907,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7671,
+      "step": 377
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.37260153134251817,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7026,
+      "step": 378
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.36730061022340127,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7141,
+      "step": 379
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.37221569937989507,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7274,
+      "step": 380
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.5459296283985436,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7002,
+      "step": 381
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.38125020228424533,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6742,
+      "step": 382
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4448395502917699,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7735,
+      "step": 383
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3836625050535906,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6939,
+      "step": 384
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.41170316536268653,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7362,
+      "step": 385
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3720648756073222,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7185,
+      "step": 386
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.43163359631403647,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.8141,
+      "step": 387
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.49092374396968624,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.7933,
+      "step": 388
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.41203465556681096,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7825,
+      "step": 389
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.34874330169641377,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.685,
+      "step": 390
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.37734527159453984,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6724,
+      "step": 391
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3834950796125026,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6698,
+      "step": 392
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.3758045774328492,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7996,
+      "step": 393
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3351876872133651,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.6819,
+      "step": 394
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.42877069393958817,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.8079,
+      "step": 395
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3813152540171064,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7481,
+      "step": 396
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.35937473307719536,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6833,
+      "step": 397
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.39170839948506153,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7506,
+      "step": 398
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.3845078779013667,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7419,
+      "step": 399
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.38223550799166633,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7592,
+      "step": 400
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.3483973783122578,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6416,
+      "step": 401
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.355210671910389,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7059,
+      "step": 402
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.3394912634882092,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6609,
+      "step": 403
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.34382031484106235,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6563,
+      "step": 404
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.3887889453512105,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7017,
+      "step": 405
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.472045409910418,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7268,
+      "step": 406
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.442285904294921,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.8207,
+      "step": 407
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3741632001636999,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7306,
+      "step": 408
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.3359492821294262,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.642,
+      "step": 409
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3792909475977315,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6988,
+      "step": 410
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4091566828975077,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7251,
+      "step": 411
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3853490829736871,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7023,
+      "step": 412
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.3625820930850993,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6429,
+      "step": 413
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4418338332757277,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7317,
+      "step": 414
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4224791555196694,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7059,
+      "step": 415
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4038881731645075,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.7844,
+      "step": 416
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3976998105956467,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6983,
+      "step": 417
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4061098778627193,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7284,
+      "step": 418
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.35198021454890427,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6564,
+      "step": 419
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3909180160479183,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6891,
+      "step": 420
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.37386668018924474,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7024,
+      "step": 421
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.36816918010672717,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7013,
+      "step": 422
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.44561999090184584,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7692,
+      "step": 423
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.38911370515307514,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6697,
+      "step": 424
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3765715958260219,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6921,
+      "step": 425
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.38011344689911836,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.7282,
+      "step": 426
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.4167767279838049,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7097,
+      "step": 427
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3756370256031805,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7363,
+      "step": 428
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.3493406120688842,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6947,
+      "step": 429
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4134756820671265,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7831,
+      "step": 430
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.3595003574635689,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7227,
+      "step": 431
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.36324495205571183,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6952,
+      "step": 432
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.411860655301704,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.8135,
+      "step": 433
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.45684704308654817,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.7334,
+      "step": 434
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.38234672207344367,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7387,
+      "step": 435
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3999340583677459,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6767,
+      "step": 436
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.41152430409945573,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7795,
+      "step": 437
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4038304045299731,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7319,
+      "step": 438
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.40001256015158243,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.7277,
+      "step": 439
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.36390171030656865,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7553,
+      "step": 440
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.41043851211960763,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7441,
+      "step": 441
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.40917804053191087,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7092,
+      "step": 442
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3883299801449483,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6696,
+      "step": 443
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 1.0963461420450433,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6994,
+      "step": 444
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3568816293560058,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6823,
+      "step": 445
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.4023538231843036,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.7629,
+      "step": 446
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.40436434352028394,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.7225,
+      "step": 447
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3878036814885285,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7552,
+      "step": 448
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3631674545513451,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6771,
+      "step": 449
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.35252670588368107,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.5971,
+      "step": 450
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3712317861407566,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.7034,
+      "step": 451
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.39408335238973335,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7398,
+      "step": 452
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.3532409220108127,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6397,
+      "step": 453
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.36451098268841636,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7524,
+      "step": 454
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.40477922284122053,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.7629,
+      "step": 455
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.40274299719829526,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7409,
+      "step": 456
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.3909281065215058,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7027,
+      "step": 457
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4245347694176414,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.7496,
+      "step": 458
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.37669925736047244,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6715,
+      "step": 459
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.44194636784462143,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.718,
+      "step": 460
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.33576723517865975,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7073,
+      "step": 461
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3526403484194062,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6737,
+      "step": 462
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.40613972281563926,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6804,
+      "step": 463
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3652644263225935,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6482,
+      "step": 464
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.44840540807785684,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7368,
+      "step": 465
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.37569432333962294,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6399,
+      "step": 466
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.41847764928324893,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.7856,
+      "step": 467
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3913727858664601,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7074,
+      "step": 468
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.36295695446323223,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6346,
+      "step": 469
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.37023749610856527,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6911,
+      "step": 470
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.4668246705267103,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7567,
+      "step": 471
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3715919476936774,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6404,
+      "step": 472
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.35973638741996034,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.662,
+      "step": 473
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.36509820892589967,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6892,
+      "step": 474
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.46598374458522496,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.7504,
+      "step": 475
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.42417348938962923,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.7465,
+      "step": 476
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3502086934855863,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.5907,
+      "step": 477
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.43110347939361826,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.7651,
+      "step": 478
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.39398513650777234,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.7541,
+      "step": 479
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3770624916644866,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.7266,
+      "step": 480
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.39499788836012734,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.7463,
+      "step": 481
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3672763525548306,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6447,
+      "step": 482
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.39066711504394686,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.7406,
+      "step": 483
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.43563331580206266,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7901,
+      "step": 484
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.40190328232436373,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7205,
+      "step": 485
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.42431966071154587,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.7547,
+      "step": 486
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3560331931544561,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6801,
+      "step": 487
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3377482649836737,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6433,
+      "step": 488
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4109657873465769,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.8159,
+      "step": 489
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.38005015654533436,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7088,
+      "step": 490
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.5488690593328647,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7626,
+      "step": 491
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3962249485454088,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.723,
+      "step": 492
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.419498559865438,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.7227,
+      "step": 493
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.41360813064611796,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7207,
+      "step": 494
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.37988686552506207,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6963,
+      "step": 495
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.5041206543316924,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.8632,
+      "step": 496
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3612199502341204,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.7003,
+      "step": 497
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3751721903936928,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.704,
+      "step": 498
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.38764348543483956,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.685,
+      "step": 499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.39281159642279473,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.7214,
+      "step": 500
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.34209994426755214,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6792,
+      "step": 501
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.40163707214933136,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7288,
+      "step": 502
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.41573749121963943,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7534,
+      "step": 503
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.391848400237993,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.7321,
+      "step": 504
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.43311391958987744,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7086,
+      "step": 505
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4536007636519588,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7244,
+      "step": 506
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.3652130722470363,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6782,
+      "step": 507
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.31065133138155593,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6423,
+      "step": 508
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3348317769014843,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.611,
+      "step": 509
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3989098217381608,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.7136,
+      "step": 510
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.4126240586209762,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.7518,
+      "step": 511
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.36563825916354054,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6462,
+      "step": 512
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4101710514975283,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7773,
+      "step": 513
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4104198614977016,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6886,
+      "step": 514
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.5832241207784147,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.7153,
+      "step": 515
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.42754824152447896,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.7257,
+      "step": 516
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.36928201720825976,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6139,
+      "step": 517
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3974920487783144,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7925,
+      "step": 518
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3488093554801746,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.67,
+      "step": 519
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4081500958560575,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.7698,
+      "step": 520
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3510043183393745,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6505,
+      "step": 521
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.40969858220650185,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7218,
+      "step": 522
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.3533872161820804,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.7261,
+      "step": 523
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3693744283782947,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.5911,
+      "step": 524
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3768413564202193,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.7112,
+      "step": 525
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3298457775194949,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6377,
+      "step": 526
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.38690638394379856,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.7077,
+      "step": 527
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.37086823716999645,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.7148,
+      "step": 528
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.42330824488193325,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.7534,
+      "step": 529
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.48300690616154035,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6976,
+      "step": 530
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.3900694708123389,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6809,
+      "step": 531
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3784642140178504,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7073,
+      "step": 532
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3732175647059916,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6731,
+      "step": 533
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3873008012047257,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7025,
+      "step": 534
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3844093543014457,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6612,
+      "step": 535
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.41060350831625075,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.7457,
+      "step": 536
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.4096525505586372,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6801,
+      "step": 537
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4007266962985699,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6904,
+      "step": 538
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4197327497544567,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.7528,
+      "step": 539
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.35904867567271265,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.7328,
+      "step": 540
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.40276001415498663,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6754,
+      "step": 541
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3813333734708716,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.7141,
+      "step": 542
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.5093889871823375,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6977,
+      "step": 543
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.400132373769689,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6798,
+      "step": 544
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.37453588167409657,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6905,
+      "step": 545
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.445413282690777,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.8455,
+      "step": 546
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.40467766806413463,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7624,
+      "step": 547
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.37321198097224145,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6554,
+      "step": 548
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.3677412242083649,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6719,
+      "step": 549
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.36164040774173506,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6459,
+      "step": 550
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.3650823538229352,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.707,
+      "step": 551
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3804371610295385,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.7594,
+      "step": 552
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.44106616411053606,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6696,
+      "step": 553
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3324267407818426,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6161,
+      "step": 554
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.37600498420808864,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6506,
+      "step": 555
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.39884051843359386,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.7362,
+      "step": 556
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.366487353854839,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6605,
+      "step": 557
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.36411247167166455,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6871,
+      "step": 558
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.4065998256016423,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.7495,
+      "step": 559
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.41231735691420857,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6689,
+      "step": 560
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.3854212616522437,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.7021,
+      "step": 561
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3392210742716694,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6756,
+      "step": 562
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.35168297042146535,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.627,
+      "step": 563
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.38103576987056464,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6984,
+      "step": 564
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.35298484224758003,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6598,
+      "step": 565
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.41249906064070707,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.7793,
+      "step": 566
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.40212432590182495,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6987,
+      "step": 567
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3287284941233722,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6385,
+      "step": 568
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.41328744455408084,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6854,
+      "step": 569
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3820446954303476,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.7051,
+      "step": 570
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.40956797346664336,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.7485,
+      "step": 571
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3682969904631875,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6781,
+      "step": 572
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4012523688681302,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.717,
+      "step": 573
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.34306295120439945,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6434,
+      "step": 574
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.41628562886755166,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.7294,
+      "step": 575
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4366815833874182,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.7411,
+      "step": 576
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.34382043328844686,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6047,
+      "step": 577
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.42835073132185475,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.7502,
+      "step": 578
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.37914174525666405,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6901,
+      "step": 579
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.37645121302077367,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7065,
+      "step": 580
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.38088786213147147,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.7029,
+      "step": 581
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.39503162545070064,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.7145,
+      "step": 582
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3905990159592029,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7125,
+      "step": 583
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4200098993675833,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.7683,
+      "step": 584
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4029202613775148,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7574,
+      "step": 585
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.40597596536313096,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7061,
+      "step": 586
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4895979686581715,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.756,
+      "step": 587
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.35978484679119643,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.7009,
+      "step": 588
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.33757437669375695,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6475,
+      "step": 589
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3608122154310713,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6549,
+      "step": 590
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.33851915578617664,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6751,
+      "step": 591
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.393001219191284,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6747,
+      "step": 592
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.3802930029455698,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6351,
+      "step": 593
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3668943160030006,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7399,
+      "step": 594
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.5284019200681044,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6583,
+      "step": 595
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3672083835934735,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6405,
+      "step": 596
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.38329428540247795,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.7362,
+      "step": 597
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3496210739665151,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6358,
+      "step": 598
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.40403696037973136,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6847,
+      "step": 599
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4048348645650435,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6897,
+      "step": 600
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.33153420780396853,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6138,
+      "step": 601
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.39677315949372843,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6974,
+      "step": 602
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.4255411496687473,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7638,
+      "step": 603
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.44715029336801493,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.734,
+      "step": 604
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.3823527796727132,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.69,
+      "step": 605
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.41897928091944026,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.8015,
+      "step": 606
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.3914526389133937,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.742,
+      "step": 607
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.4178354504278549,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6297,
+      "step": 608
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.40336635899089013,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.7265,
+      "step": 609
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3718102945561971,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6742,
+      "step": 610
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.3737535685050714,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.7229,
+      "step": 611
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.42769307916587873,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.7555,
+      "step": 612
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.3577862820441812,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6454,
+      "step": 613
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.41318583087110927,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.7695,
+      "step": 614
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.47977922845908477,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.7193,
+      "step": 615
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.42266778140194194,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.8069,
+      "step": 616
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.38078422349478713,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6981,
+      "step": 617
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3477323908052074,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6715,
+      "step": 618
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.5458686512071936,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.7217,
+      "step": 619
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4146727887590274,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.7601,
+      "step": 620
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4082313641603639,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7122,
+      "step": 621
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.39072806435910845,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.7094,
+      "step": 622
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.40020469038001116,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.73,
+      "step": 623
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3910437689979269,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.7462,
+      "step": 624
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3666639480621862,
+      "learning_rate": 0.0,
+      "loss": 0.6875,
+      "step": 625
+    },
+    {
+      "epoch": 1.0,
+      "step": 625,
+      "total_flos": 571506442403840.0,
+      "train_loss": 0.7636905529975891,
+      "train_runtime": 9780.3316,
+      "train_samples_per_second": 1.022,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 625,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 571506442403840.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..98918016adb9851e1c5ca954526bb545a66ee1da
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "v_proj",
+    "q_proj",
+    "gate_proj",
+    "o_proj",
+    "down_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1432842b2ee569a073300a4d1387238800738fba
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54b5fd0cc5001f6e6cc9d42b52851474bbb03ef5d4b0a049aec1540a8b7ea3f7
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..84294b6576d96199d8126d4fc0ce6ee6e4f2d304
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebbbff62258d6295b0d9016e1c944413d92bc04f1cf1a11cf8a750841d39401f
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4141d91c7ffae0051e6382b17e734540bf4e67ce
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.7770269714650887,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.2881,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.024047044935287,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.2967,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.9179324969023009,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.3872,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8298904257412587,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3031,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.7730459391219203,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.2892,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.6600669344799063,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.1634,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.8296588219702797,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.2471,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6964921879934483,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.1325,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.6118135227381648,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.0626,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 1.121380634255772,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.0503,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.7396748278621545,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.0111,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.6249189450745211,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.9344,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.6499687655104689,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 0.9215,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5076935214653521,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.8516,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.6888973666116883,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 0.99,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.5591842011521481,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.906,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.5321833673418794,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.8883,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.48128609176908127,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8505,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.51363735744202,
+      "learning_rate": 0.0001,
+      "loss": 0.9719,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5957002621267714,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.0021,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.5163578810666871,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.8585,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.47777288535864215,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8385,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5330569062110371,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 1.0255,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5980756167605328,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9987,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.5206638044866894,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.9226,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5487650960666887,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9099,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.49145451533617074,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.8579,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.47189502298098335,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8676,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.5080289216879348,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.895,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.5226289806694986,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9314,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.4457264336120691,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.8269,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.4700824414453813,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8755,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.5235401864352286,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.9357,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.5026293819871382,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.9186,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.5855558657957484,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.8541,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.48471644560507177,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.9137,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.46478434248194045,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8311,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.40363878205922327,
+      "learning_rate": 0.0002,
+      "loss": 0.789,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.4557241944006385,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.9019,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5136938589333343,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.9092,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.4589064324372691,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.8153,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.4711520806636325,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8642,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.4642243410765449,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.8678,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.47418870887286674,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8437,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.47714001354788554,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.8387,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.4084594735562426,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8178,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.43334073220477404,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.7866,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4094875104431594,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.737,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.49985762919942306,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.8179,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.44413007504680635,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.7888,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.47530439086714704,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.7823,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.46994207893111456,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8512,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.5374106034782066,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8281,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.4634713085884633,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.816,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.4990707735023886,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.8629,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4823910263411063,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8284,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.40558494647973825,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.7613,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.5058399519984393,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9216,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.4430971022831468,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.8424,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.4599605477233072,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.9151,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.41773726348192974,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.802,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5273326132401231,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.9706,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.4346491260010976,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.8258,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.439233301783603,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7435,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.5408845811493046,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.9646,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.3839446160384672,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.7484,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.4663067629148098,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.838,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4811492144146111,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.872,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.45118193674818063,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.8246,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4486434942728305,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7811,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.4703205860664721,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.8238,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4795012837534885,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.881,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.4707544400703513,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.9132,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4266920813293674,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.8602,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.4510992661418326,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.9129,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5040527366718769,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.9165,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.41313107554868533,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.8491,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4494431303697321,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8507,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.4376764818391794,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.8205,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5122357726085491,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.8123,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.4115765441551589,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.8026,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.4509542037042227,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8328,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.42594272629359003,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.7765,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4260909589227541,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8405,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.41573168903833024,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.8231,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.39933734200305154,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8063,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.4345644903978346,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.8267,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.40127693336643405,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.7522,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.40199553950201616,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.8032,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.48327385003933276,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8217,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.3908920050780294,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.8312,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.49522901161784727,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8805,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.4720762873467958,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.8353,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4673667387850208,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8156,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.48274563016239064,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.9052,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.38141973015887337,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7523,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.42718404335013443,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.8815,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.45654683275776975,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.79,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.39958380818714795,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.8769,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.49023723846133654,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.9067,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.39793474917174537,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.7653,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.45045541780062376,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8277,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.44590880781273884,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.8155,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4655315399141277,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8375,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.4838098171615128,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.8239,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.40968833788240205,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7784,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.4729664739308913,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.7989,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4524948610255115,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8658,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.42381451152368793,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.8504,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.40214305600150546,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8271,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.4628861171033263,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.8439,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.44188860733222457,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7837,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.4483825994700799,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.9282,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4710324365514096,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8705,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.46510154454495684,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.8115,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.405556088903293,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.8209,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.3933253178211884,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.7824,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4441308602732538,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8158,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.41600800450120323,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.806,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4505417888946031,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.86,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.44419662115731584,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.7928,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.46078770584507683,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.759,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.4861036696461452,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.909,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.47935094149952195,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.8432,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5021621797308456,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.8899,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.41210306509262695,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7877,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.4398250863148764,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.7817,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3828979564308189,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7777,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.4624433449613049,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.7517,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4598025687772892,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.7982,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.4425508776530567,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.8692,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4842287203577144,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.84,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.45435282722373527,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.8132,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.5513821385591284,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8852,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.44676033887524785,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.8316,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4258188039172344,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7812,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.4824087947941197,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.818,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.4187342117306936,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8092,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.4017058881104004,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.6962,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.448079610421751,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8068,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.3986220602603605,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.8267,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4677803646502211,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.8303,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.3967906879343072,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.8039,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4641287483774464,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8314,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.4371119687642268,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.8006,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.4251072016794893,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7494,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.43688346737206174,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.765,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.42924160099480074,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8122,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.40941526606709683,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.8025,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4753545185318249,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8097,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.4297319370236763,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.8136,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.44389185288420413,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8309,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.4454598228098689,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.8655,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4379720315213458,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.8103,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.5229416617277775,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.995,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.3861539358658946,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7538,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.4277791103455093,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.8023,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.47272910144735575,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.8189,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.4464716182886099,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.7289,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4227171621774377,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8555,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.4395286840376491,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.8033,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.43507375337288545,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8616,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.4632310561001686,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.7553,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.43899436188218655,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7203,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.4237430849392596,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.7229,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4527915263031912,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8702,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.43131694202801923,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.7961,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.49437880601950374,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.9102,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.43659462457447684,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.7614,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.41918191497256263,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7852,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.4291690963237692,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.7746,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.6143376957940382,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8101,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.4314498742806767,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.7557,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.48829490311943763,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.7513,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.43851150291390606,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.8017,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.38046129450067623,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.696,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.4291326347607385,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.7703,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4917391816960092,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.8717,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.4353352524294646,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.7938,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4418832858666523,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8093,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.470468993784615,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.8472,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.39426705301336207,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7785,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.4018827466079507,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.7205,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.38227905944522594,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7488,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.4253768460834158,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.7844,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.433341824982204,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7815,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.4416209521555928,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.8655,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4540173442027962,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8846,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.4428878025484908,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.8015,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4904578720842893,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8315,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.44336847472518826,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.8542,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.49226825803525953,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8338,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.501756094514099,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.7713,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.5435447056177455,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8257,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.44313370250703266,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.7919,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.47342288389157794,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.9051,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.3999241068503602,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.7938,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4631029137866705,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8189,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.40421567014884724,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.7728,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3723392140339022,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7384,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.430167081220453,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.7903,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.40241535953398005,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8059,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.38901205632140157,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.7093,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4899648722987168,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8009,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.43679788350937493,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.8485,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.41474942601321974,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7366,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.3965788027409051,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.6985,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.45497223123457115,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8644,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.4283454248563262,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.8169,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.3950155450695892,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8138,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.4136694620708334,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.7565,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.39632663144028374,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8258,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.4114557247555061,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.7951,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.40632063758759807,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.777,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.42803025719964816,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.8007,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.39737743505525436,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7175,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.5020832745669306,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.8899,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4080587031809224,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.751,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.39062572686119307,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.7361,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4001701095298602,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7102,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.4275503431259605,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.7437,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4255501208026851,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8217,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.45702297406020853,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.8768,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4441420909638248,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8756,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.4184670778035845,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.7577,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.4487239911209529,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7787,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.4139557396543828,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.6914,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.3889468554789524,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7042,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.43149944673496776,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.7721,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.36052127629596975,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.6844,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.5080376534107922,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.8687,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.45171658664266007,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7657,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.4738203786339585,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.8564,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.48625884167655675,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8189,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.44129173607747263,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.7819,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3984774646323755,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7441,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.41477157567690254,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.7725,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.41718842233637154,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7659,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.44251795593787574,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.8403,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.41318693117299315,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7442,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.48641842669637025,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.8138,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.42387328232905025,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8126,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.3931945023044329,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.7536,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3997122267628017,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7441,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.404588720437904,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.7707,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.45216094511412713,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7601,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.4326863137511993,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.7883,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4071967675769803,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.7515,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.41938525195695797,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.79,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.40042410823512975,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7623,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.4113599828024844,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.7854,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4728098642283454,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.9151,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.40323520325355633,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.7357,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.3791079153765052,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7362,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.42769699257246035,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.8031,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.3746111112502971,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7349,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.4613476961289269,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.794,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4303021025297545,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7774,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.42397060786526264,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.7682,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.48690701433814254,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.8032,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.4105137729498451,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.7547,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.3912642589897561,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7919,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.4080005214517403,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.7707,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4337941426942863,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.802,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.40181116809323447,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.7609,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.41880680034037987,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.8195,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.42910528130461506,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.7449,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.41345533405075885,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7912,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.4094307996948985,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.7605,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.3895176750928822,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.77,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.37143027527339506,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.716,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4062755623152971,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7575,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.41625469536375176,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.8403,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.41188441901889533,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.796,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.4341196489324385,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.7704,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4532063166425961,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8187,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.44091211069612063,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.7616,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4069755993489469,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8089,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.4382020418168502,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.7732,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3855838394816359,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7443,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.4479039803767857,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.7873,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.44642356676805167,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.854,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.3721039804434336,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.6817,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.40861524285292483,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7492,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.4416102456142807,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.735,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.43530327527758694,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7981,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.4374583283813062,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.7864,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.6698972887381797,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7676,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.5739032302057141,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.8291,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.3962191799864678,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7208,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.40797488365072615,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.7671,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.40762829380890714,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7676,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.3827737010265142,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.7365,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.3812057580305516,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.751,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.37992151906117433,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.7454,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3590342086282725,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7102,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.3989236843068554,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.7709,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.380756116585582,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7821,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.39172912345873656,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.7082,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3920341448417732,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7288,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.432987963265769,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.8452,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.43805844914407543,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.8103,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.39726718393916,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.7568,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4927533672406732,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8236,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.38944794969047297,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.6928,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.42662679526881486,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7484,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.43251903973122213,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.7706,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4819805473665938,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.8178,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.4340033691860238,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.7853,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4198050552753312,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7306,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.3667608095911679,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.7313,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.45854263014244123,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8512,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.3870267720499676,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.7389,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.3629421043115878,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7279,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.45435742792899936,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.7784,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.38769257652266653,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.6862,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.43544323597179285,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.7836,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4303548195748457,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7015,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.3988815649130947,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.7428,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.36695351109613955,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.6559,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.4536080222581322,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.7842,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.47821792563657084,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.8441,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.3704778738018211,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.6822,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.428327767075066,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7791,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.3961450976529274,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.7394,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.41929806219428845,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7883,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.3894422610473363,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.7458,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.36295835419452754,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.684,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.3569673265635979,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.7176,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3865910785194446,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7707,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.38216796057368174,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.704,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.45114147371211993,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7484,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.3646183328954607,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.671,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4368523232160318,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.755,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.4247692667642482,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.7828,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4901671777097176,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8043,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.4178507084032011,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.834,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.42204066116677874,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7418,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.4081564035171982,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.7796,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.40402309874689885,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7859,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.42467107811937443,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.754,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.3954585462658397,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7454,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.4699401441706718,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.7971,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.43203384628124064,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7608,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.3989431542163795,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.7594,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.5004464475895747,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8348,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.44866367241052296,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.8461,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4208747472469387,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7646,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.4250138345747438,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.7661,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.42876210270998055,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7558,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.3919074809576166,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.731,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4390458646036582,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.8194,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.40728141579339516,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.7575,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4007468595299471,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7493,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 1.4849889980591207,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.7374,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.45413652126351345,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7726,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.345963315677467,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.6959,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4270195519889394,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8031,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.443768314112945,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.7047,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.44007610886830467,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8055,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.41782305359904554,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.7742,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.46127543477045563,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.812,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.4708279566376157,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.8455,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.3820793856650901,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7407,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.4418698534422898,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.8059,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.42559259411748557,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7907,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.40526252997293044,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.7387,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4537639887124046,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.8205,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.41607678697428313,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.7634,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.46647291980529587,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.8456,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.42211238390913575,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.7943,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.7479041557290732,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.8312,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.44974913045518755,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.7288,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.42900284177400566,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7376,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.43745295104653914,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.8216,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.34707937466301986,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.6859,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.4857945786176909,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.8421,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.39121290861369795,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7159,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.46354388130478685,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.7939,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4270592250949217,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.8159,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.3871489562745451,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.7152,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.43235168230976634,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7932,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.39913774093727283,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7317,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4100573891738185,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7645,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.4231866067832828,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7223,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.43092174954976703,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7011,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.44583569757552366,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.8321,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4484364582764281,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7902,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.3756430709642739,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.7238,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.4302534613418457,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7639,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.4366161009671638,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.8006,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4080078257128167,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7529,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.38655595838544954,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.7384,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.42310522433274306,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6947,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.4425047945464679,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.7619,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.37997601824260263,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7174,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.4546294953554832,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.7458,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.39330551079256165,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.6811,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.45042609210794154,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.7373,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3637892815630396,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.6868,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.40543358824559494,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.758,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4121138567827428,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7679,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.40666299722383215,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.7196,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.41336832248445105,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7178,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.44566015950782456,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.8355,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.41409975993872233,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7177,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.42395424691211214,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.8186,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.346635875303523,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.6821,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.4176984516392487,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.797,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.45983357652109397,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.8478,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.4077699212094806,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.7857,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.36392275707834104,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6917,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.40663804131126274,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.7344,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4342493920150342,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.8365,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.4227117158794363,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.7778,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.41270891685667144,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7261,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.4158768104130257,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.7938,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.3775853119004546,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7217,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.3941198166676895,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.6884,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.42915734195324934,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7512,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.38564142369791643,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.6865,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.3693977836154458,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.6848,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.4200972826590631,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.7806,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4117289702100441,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7887,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.3958065401208961,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.7447,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.379122263890142,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7405,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.38761228847035534,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.7099,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.37619366133112303,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7609,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.4562285195993237,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.7751,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.41552372433757795,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7491,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.45218399720416264,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.8822,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.36646616666120246,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7087,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.4440092106035693,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.7601,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4264021944001832,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7661,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.4830509634544988,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.7945,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.47615679985015075,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8693,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.405127275114156,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.7853,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.43826022955665683,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7163,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.43433703098792414,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.7845,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4056094338704598,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7391,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.5160656198284714,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.8635,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.3876592269599419,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.7177,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.424270065327115,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.7222,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.37854882642483223,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7027,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.431669147423509,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.7044,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.40362916033901547,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7245,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.3651916039821122,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.7154,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3948131618499892,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7141,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.43297346709780593,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.7856,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4081859143003985,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7498,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.4239090344818062,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.7557,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3956920569834454,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7802,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.38601196388397474,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.738,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.42270336528219926,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7315,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.4199661760942862,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.7757,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.39532678138017263,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7166,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.3451845364961653,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.6645,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.3759791521821676,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7803,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.3946098699766991,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.74,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.42363585128536285,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7539,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.4292403665539383,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.704,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.4223905551143545,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7423,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.4083339126716739,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7534,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4266340273316555,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7944,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.4244010609467904,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.7322,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.3792106682271149,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.6975,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.3978947925627141,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.7467,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4470667818288522,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7744,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.42487360452471123,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.7553,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.409805908518081,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7162,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.4529557352357633,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.7749,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.399071833197901,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.8104,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.4115889798381014,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.8162,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.37200763367107226,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7473,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.3633670300337601,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.7011,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3858944272926558,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7235,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.40358992250738557,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.7344,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.4165317885836963,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7464,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.4161390699346155,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.8025,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5327238138409245,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.9732,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.37746784110298387,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.6851,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.4096403959250972,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7213,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.36094519904258365,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.6588,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3926746452925469,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7005,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.42257281513267286,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.8084,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.3424676525058974,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.638,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.38916858808887456,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.7487,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4246159738231621,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.773,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.4281633221166481,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.7671,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.48387099878159534,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.8217,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.3703953881605659,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.7481,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.38374095321400503,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7198,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.44187232005789906,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.7858,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.42479757659136425,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7582,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.37778309773123786,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.7027,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.38595074168769156,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.6871,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.47644774807222423,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.8481,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.4438823472718032,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7446,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.39444101812002935,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.7029,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.3989016333638246,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7502,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.3929916527658517,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.7168,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.4330542431318848,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7895,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.46511998999871673,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.779,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4120613132882409,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7321,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.42892550263482604,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.7812,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.41065320820919565,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7743,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.38414982621095234,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.6896,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3955180803765117,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7284,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.43118579124945994,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.7303,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.3803715757708683,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.6969,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.42238276187713786,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.7433,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.43112653439928567,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7976,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.4177853496713614,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7387,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3876745806420402,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6952,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.3839120757027477,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.6984,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.430712755838551,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7869,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.36874939887173747,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.7045,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.39771894299769867,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7074,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.40636985692015276,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.7234,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4362192462858971,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7513,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.41059237448087205,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.7866,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.45428657486743507,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7614,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.3826319609724033,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.7032,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3986937022304439,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.6939,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.3855053555062931,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.6806,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.422175727203896,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.751,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.4114716181367486,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.7218,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.42006078486237336,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7178,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.37219227951366723,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.6568,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4059513618713587,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7555,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.4305993332670556,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.7678,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.35206527911535446,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.6382,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.41113202720973163,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.7057,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.3901613555563609,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7342,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.4113244043759376,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.78,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4017800562369879,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.744,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.41562416829574583,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.7514,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.39518105073910975,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6914,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.3695062349334831,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.7162,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3821071049370452,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7041,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.3895151331287037,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.7108,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4555783174734416,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7449,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.40261820838834467,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.8089,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4537195432426405,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.8016,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.41674676712927666,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.7191,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.3615959762032306,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6631,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.4045962545723302,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.731,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.39050807685917494,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7772,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.3757834980550243,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.6894,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3689254682213251,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6835,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.42097646607624756,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.7117,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3528241671936896,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6413,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.3809898049081442,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.7273,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.40189423114516165,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7671,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.44260966954087205,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.7555,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4121804448249583,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7634,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.4147017931293983,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.7624,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.3966852226772586,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7028,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.40152430826623603,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.6969,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.45074530522342926,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7938,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.4388723825809723,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.8302,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.40997854160175323,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.8194,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.37438786055649476,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.7173,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3972270190292053,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7061,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.3795007296042973,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.6833,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.39273732706955466,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.6964,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.39557876005056253,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.7345,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.37767135940417723,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6585,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.38797574363602066,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.7205,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3630726329629002,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.677,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.4387536553028977,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.7619,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.38464703104051634,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7789,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.39167937019149635,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.6815,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.3977451991379984,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6787,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.3931384610009728,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.7128,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4069419597097396,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7026,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.39761087525955485,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.7373,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.3946874053496708,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6923,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.38192717118076763,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.7299,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.41838798523777854,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7477,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.3694964924391603,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.6517,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.3969000692318527,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.6973,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.37636076226194104,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.7195,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.43492259485161183,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7638,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.4126453745393548,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.7832,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4108148514985448,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.721,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.5666162307393274,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.8376,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3817260351961357,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6471,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.43432725967521973,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.7645,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.427447555440724,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7164,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.34959973158873514,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.6934,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4375141744900696,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.6958,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.41045570654286445,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.7066,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.43400068046834894,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.748,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.3965691779769979,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.6862,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.42202372939300253,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7473,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.38843352566967987,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.6793,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.40712476358977573,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7815,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.45280542740712704,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.8345,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3988135723412154,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.6403,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.4104719902931324,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.8083,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.43497733154663787,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.786,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.4282110309376388,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.8164,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3805016494025352,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7215,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.3776584079691621,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.7185,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.36549179775783314,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.6423,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.42155805120324596,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.7358,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3797360196402241,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7047,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.40773799010081985,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.6931,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.3610044222301794,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6668,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.4399774369868542,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.7299,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3795477589972217,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6782,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.3976887802313931,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.6831,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4646173856321158,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7669,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.3779829407750109,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.7257,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.39575116618933087,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6947,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.39042799949121,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.6909,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.40122823176564265,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.72,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.4063853232863709,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.7005,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3620850889881979,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7287,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.4602973221668992,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.7608,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.34145601526624864,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6831,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.40756445625679594,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.7884,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4071185750181592,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7479,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.3645809044587315,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.6517,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.39643622655244104,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7431,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.41844749866098513,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.7584,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4490939032967661,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.776,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.41286117568180564,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.7442,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.394910202672662,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7218,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.34524918277952554,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.6657,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4013001926497867,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7162,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.3626432188482639,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.6827,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.40406396202226275,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7414,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.35123972431962647,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.6682,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.42085291283779447,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7348,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.3919260383650782,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.6383,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3846362087718709,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.7089,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.39416896980117394,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.7382,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.43852824981762756,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7824,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.4371752703502908,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.7316,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.3680561270793559,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6562,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.413711406358694,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.7163,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3930940635380676,
+      "learning_rate": 0.0001,
+      "loss": 0.7017,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.4046241071819806,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.7353,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.36892581761882737,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7146,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.35157467797738684,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.6643,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3708634742026403,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.696,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.3550674722920594,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.67,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3775707423902258,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6261,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.38402987317749526,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.695,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.34315102173734846,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.652,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.42916661417714574,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.7455,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.4010076974964933,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7155,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.36917368065934286,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.6521,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3991390058129583,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7011,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.39216997183273594,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.7065,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.442582130429027,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.6977,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.3999220733985173,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.7035,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4335486544309202,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.687,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.42951244215856904,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.7494,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3916325898469945,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6748,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.39839736695567196,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.7084,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3602102298252946,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6287,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.39360077695176615,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.6908,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3813408459588176,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6999,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.3824402547979255,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.6541,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.5083614991019088,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.855,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.445603718596326,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.7599,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.47146828085890363,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.8023,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.43408367978037793,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.7014,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.425448138566533,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7164,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.39227384599061355,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.6953,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.38518524255604936,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.7348,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.3780220958648424,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.6585,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.37491163307793673,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7239,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.4297851476523369,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.7229,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.4270389158180757,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7099,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.44843851052946626,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.7708,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.39071195168014267,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7267,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.3686899418824359,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.7128,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.484195844823364,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.6819,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.3832259944118132,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.6828,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.41458472472208135,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7577,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.3617464827857215,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.7103,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.4058552873419775,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7412,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.41965368293910055,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.7487,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.44970944641672744,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.778,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.3545719381566445,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.6984,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3790480088341582,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6866,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.3852953689419974,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.7205,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.36277085602796183,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6941,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.414706108808844,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.7056,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.4476749836974746,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7337,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.40156586271414774,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.7279,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.36420512506545966,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7013,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.4016790412811444,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.7029,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.4071846828907595,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7902,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.4193000771027037,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.7654,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3475117889941879,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6732,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.35458055124710486,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.6897,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.38841476335998537,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7214,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.45433014014250656,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.7998,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.3586743870703551,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6571,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.35312943544102154,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.6652,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.39348577997621986,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.701,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.3794069943977316,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.6863,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.40509681006948783,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6918,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.4064801302224138,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.709,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.37515881106172344,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7065,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.42111045551957627,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.6952,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.47257730917513174,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7128,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.5796908578294863,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.751,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.3862793085569805,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7303,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.37681659135460804,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.6707,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.38351037595009896,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6837,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.41570489940380556,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.6992,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.4070342608417595,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7116,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.37017913236409716,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.6707,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3715292483742941,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6981,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.38306244022912933,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.637,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.3518606534199549,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6622,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.3999015657045575,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.7308,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4110426055818609,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6783,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.39756580345933124,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.7405,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.36916620841185016,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.609,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.34784605132280716,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.6209,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.40278232284216864,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.728,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.3812820287572344,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.7133,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3620196046589468,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6592,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.4142686294295778,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.7162,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.38944524715253764,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6986,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.3598739842145071,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.6821,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.40094634739098434,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.67,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.41535343948306597,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.7362,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.41051936650403853,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6998,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.4414551091260474,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.8053,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.37054674845555746,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.662,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.3803653850905569,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.6725,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4279603552936147,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7433,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.39183934094472184,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.6898,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.44355466321467885,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6637,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.37384479459881104,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.7046,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.42085403164939966,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.772,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.3647244240418084,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.6847,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.37293835326603875,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6968,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.4089421010509937,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.7072,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4180382473394056,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7906,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.3979438751592118,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.7198,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.39928123637778606,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7031,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.3674395446163755,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.6569,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3859486385750213,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7024,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.4276363677168132,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.7025,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4880835991048266,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7482,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.36705791881157884,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.7005,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.43099468027721016,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7631,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.36758250425564015,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.6947,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.36867475791558263,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.694,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.36077953534345314,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.6616,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.49265019267157223,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7989,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.357291492711498,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.6781,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.43128032811348194,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7786,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.4351649930566372,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.7996,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3854285270860244,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6873,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.4222448806722589,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.7686,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.3437253460142556,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6542,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.3819110837227683,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.6832,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3574634348965136,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6163,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.3958268568472108,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.7076,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.3876078568257905,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6768,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.3543407877240521,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.6646,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.5169624011043301,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7584,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.4080700210190037,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.6894,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.39850725465254166,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7047,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.3929053806516543,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.6746,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3855220091053093,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6719,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.372356925574433,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.6594,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.38465464803819926,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6652,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.40130152781871936,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.6937,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.4351001554035714,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7249,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.4115902223631893,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.7151,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4121347859261653,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.702,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.38667507587206656,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.678,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3844946676003662,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6724,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.4415379812057678,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.7513,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.39192207589977685,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6651,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.3389566163968938,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.6258,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.44741155883279,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7784,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.4052730316786015,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.7085,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.40419389102631725,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7488,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.3758590354481773,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.681,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.33656533436835895,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6377,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.4248891720992723,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.7404,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.5083490920128887,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7262,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.4226612634135987,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.6931,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.37781319738057323,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6793,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.36609240474136434,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.6842,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4510442700857087,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6874,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.410218508042047,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.6726,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.36784783267298815,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6728,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.3508863599433168,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.6246,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.3861635430903922,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6916,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.39761478038483955,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.6865,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4174642749474727,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6636,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.3970447301181033,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.6712,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.4185172143582579,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7518,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.42893284870193243,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.7415,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.34074490348733405,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6421,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.4185296047112624,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.7498,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.37522628453525303,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6293,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.4732993099724249,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.7681,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.45813852136378314,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7215,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.3951081586207811,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.7173,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3519802287916903,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6131,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.37336416938345224,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.6599,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.34585353519014184,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6665,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.38433583340493543,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.7129,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.36033709634244937,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6903,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.35504478143908713,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.7094,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.40254939614920054,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.705,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.3819902029976929,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.6737,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.33563599565287516,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.5854,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.38048738975216345,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.696,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.39463923524519234,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6579,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.36919831664676683,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.6612,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.40099308760264113,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7229,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.37460079877045543,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.6234,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4774541551947916,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7966,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.7561719087296993,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.7106,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4944294627944639,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.7827,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.3988663927068123,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.691,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.387011407683691,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6942,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.37000355400540574,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.6871,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3502077070269879,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6395,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.3618854175964115,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.6422,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4110760311370138,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7164,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.3812106705856821,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.6677,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.3961513132352477,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6548,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.44302798222292566,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.7168,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3259426827412798,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6102,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.43795911251231234,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.6708,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4003310878287411,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6586,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.4053910786415956,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.6586,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3598752059918062,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6041,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.5320693967787626,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.6331,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.41893315449125795,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6564,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.374339592096868,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.6935,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3520734439758967,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6039,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.42883636780261936,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.6703,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.42231638922587356,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6704,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.40143325519452655,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.6424,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.38747811662920445,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6868,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.4437868087919298,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.778,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.40466048504673857,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6558,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.38222637554375055,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.7072,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.49677925510795323,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.7291,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.4326515984567661,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.6775,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.3734136131599116,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6105,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.3914199028046532,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.6766,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.37051539308134473,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6432,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.40630829200241264,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.6693,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.38278665269565504,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6326,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.4713791334980711,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.6961,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.41525455685034257,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6906,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.47351257421378523,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6856,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.4117121674376699,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6759,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.4065285849628081,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.6924,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3867487613641489,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6441,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.4505060652300184,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.6634,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.400603244643206,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6884,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.37652240477713944,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.6583,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.38526985535595526,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6328,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.423397196617986,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.6883,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.3701394363125963,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6213,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.3921608532534879,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.7311,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.39700260154913886,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.7046,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.4149741300992126,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.7112,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.43756902112109974,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.7412,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.3866813232683912,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.6727,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.41430265524677684,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.722,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.3966499810715081,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.6674,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.39833178178931555,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6995,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.4159629589849531,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.7203,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.401390774208145,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6434,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.4043627983147242,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.7046,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4324011450674324,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6595,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.37240511002958815,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.6738,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4070645096267617,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.677,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.38932169604234196,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.6803,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.37185870326470627,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.678,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.36611981750305955,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.6722,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3442390261507083,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6463,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.4968410757107018,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.8251,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.37161377786132654,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6658,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.35375458841986335,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.5788,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3754231113757112,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.7152,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.3758166178101138,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.6908,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3804299058221594,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6545,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.3654851011954076,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.6307,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.44475715948671113,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7652,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.3950888083795495,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.6678,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3437194647641541,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.5405,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.3479004406263855,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.5977,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.40146136196053567,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6936,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.41475881491378036,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.7502,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4029168618382945,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6722,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.3924302456099913,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.7086,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.36298594811321255,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6135,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.325462761884952,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.5866,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3821441445340449,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6541,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.39807602958990873,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.7102,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.40302811259743876,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6601,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.4090597729618086,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.7116,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.4242164133707293,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6933,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.381356552185187,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.701,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.42053075164360254,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.7176,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.43103610147891575,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.731,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.426687475459895,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6651,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.34436746852459027,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.6021,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.3322724494396109,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.5933,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.46660288239031206,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.7253,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.39972170697874304,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6955,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.4484926930912154,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.7472,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.37842392041542816,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6795,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.33310293753759246,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.5907,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.4084447231786101,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.683,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.37544847644307205,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.7079,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.4048292248179034,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6581,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.39325536956079654,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.7142,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.387008656575126,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6504,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.40176436480323197,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.6707,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.35642080243266405,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6398,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.3840829495853328,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.7108,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.34226618771386697,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.5883,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.3730526708966016,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.6561,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5133943423547346,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7501,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.45245367415824433,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.7075,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.38279937588237534,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6271,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.48128672007504014,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.7937,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.38337871736257334,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.671,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.5835878948960984,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.5881,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.3869196656143522,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6597,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.40884136149356204,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.6587,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3643087609647497,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.619,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.38046995770456293,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.6563,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.3673923745337542,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6593,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.3903864667528407,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.6201,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4334368171640703,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7329,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.4470775758297611,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.7725,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.36984125610935664,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6816,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.393785262217113,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.675,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3862927167505613,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6531,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.3991015244488483,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.7225,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.4217027050575871,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6542,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.424375649472465,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.7528,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3879980379001011,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6424,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.4419791925895399,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.7411,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.560057032805807,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6546,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.4247675870095834,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6494,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3762748956971542,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6904,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.3702584404711336,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.6745,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.38356258934630244,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6497,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.40398104418654535,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.6966,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.39109643003107714,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6976,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.42202229115813555,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.6624,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.39471222639644493,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.7129,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.38103435763156235,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.7002,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3891360443952935,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6737,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.3891741702846402,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.6595,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.39202566812660267,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.714,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.40870840104373474,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.709,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.34838708575276983,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6257,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.39229349183084955,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.6919,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.3885075596834621,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.68,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.3653202740567082,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.6456,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.38122600677793544,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6755,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.40469846986495295,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.7281,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.3874646319132377,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6987,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.38652009318610764,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.6498,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4195905775332029,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6878,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.3332753764965799,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.631,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.4424376017968654,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6776,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.3501051595988133,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.5838,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.40590581314438823,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.7233,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.3865285519165861,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.6915,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.3729463126646092,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6419,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.3891660028099139,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.6574,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.37516748591806176,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.694,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.3627710948078153,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.6426,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.34430677201245696,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6224,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.34143260859065744,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.616,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.42325194237531244,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6802,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.374011082379245,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.6682,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3924664703808738,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6557,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.3453172403567277,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.6273,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3709064056768632,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.675,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.409732990870048,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.7157,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.367012562447277,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6951,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.3509939591651848,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.65,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.44200231466098977,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6547,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.40441240990297683,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.7347,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.38852090414582974,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6448,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.37208957771394713,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.6137,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.36708020272901665,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6749,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.3866779440010189,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.6213,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4388966309440129,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7146,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.4163581109381185,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.6568,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.34924193907392403,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6308,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.38487892629432047,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.658,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.419568928978815,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7342,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.4824537645721057,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.8479,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4886160452022071,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.654,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.3899441504671415,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.7006,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.41500226449489414,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6719,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.36673090297500144,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.6516,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.46624315341238187,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6608,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.4046771077568921,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.7124,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.45519910379364,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6935,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.4163406513330448,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.6798,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3847504340934355,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6495,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.3837095644385684,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.5916,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.3798862260824479,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.5924,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.4342551838691338,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.7215,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3891573762790059,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6261,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.39374066114535394,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.737,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4235606753007906,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7349,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.38219200505475237,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.6664,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4168244866608693,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6793,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.37050304408128276,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.6408,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.7747700697770986,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.7048,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.4379270226232551,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.6912,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.38641061387438236,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6661,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.3699145427107198,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.5911,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.37678064603454414,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6379,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.38276573145225384,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.6688,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.35659385165223567,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6418,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.4385984497720718,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.7101,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.37131113697001,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6694,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.39251911154546165,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.6851,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.39604885394823114,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6319,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.40946975711386363,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.6883,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3962931848637794,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.7025,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.35824411852444177,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.647,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.44805131354996924,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7499,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.37839428607888675,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.6739,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4316955359945715,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6555,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.485425163010363,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.7962,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.4265406221697592,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.7392,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.3996657451117564,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.7219,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.33785709813005427,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6191,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.3986088500140371,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.7453,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.4304343313428344,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.7237,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.33128590330089164,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.5727,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.3834852029532719,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6106,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.4003658469139451,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.739,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.39407968931740417,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6444,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.3795392149229405,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.6499,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.36122921429516935,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6214,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.35053543406634635,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.6659,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.37714585249730426,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6293,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.3355620267539819,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.6256,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.4178624387608608,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6817,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.4087431841898358,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.7479,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4338977119500379,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6792,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.4014367638097192,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.6514,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.36813594794623244,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.635,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.5653983945250799,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.7264,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3957528246204374,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6174,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.41306059413718516,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.6637,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3510167161446504,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6491,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.3266469766942705,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.6036,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.39283761058480665,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6192,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.3823300008834905,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.6677,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.38639058342171195,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7045,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.34967644464562886,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.6394,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.36660670673179757,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6443,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.4103450467697293,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.693,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.3729181374994256,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6148,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.380542232685799,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.6933,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3801809923588539,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6373,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.4290350941082846,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.6806,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.40742883431682103,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6822,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.3990060719895597,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.6832,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.42772499475750525,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.757,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.4410430739892881,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.687,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.37499319503363765,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.7777,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.3857697346139549,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.6491,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4005388401587205,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6342,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.4245050931906198,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.6503,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.39072486114874533,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6484,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.3370705386163272,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.6533,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4577072302250668,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.7518,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.3865547149956887,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.6319,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.4037353871825628,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.7181,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.3893878280163203,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.6872,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4353209478547202,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6773,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.443063602249416,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.6929,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.3672377957442898,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6417,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.4741654482655923,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.7018,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.35363508243119146,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6645,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.43547070255930115,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.6793,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.4127577831340952,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.663,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.3984495801609078,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.6869,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4264269134271497,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6845,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.4475181742562311,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.7188,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.38976354506503263,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6712,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.4072056344974993,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.7189,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3814969538983557,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.622,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.39198374184146784,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.72,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4299031653523382,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6712,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.42062112658601636,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.6634,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.381095188428428,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6104,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.3740011730979217,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.6459,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3482311112245457,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.5719,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.34026593781578274,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.5916,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4443275218242843,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6623,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.37590807547566096,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.6557,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.41442118569066855,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.7623,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.35148898910324156,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.6086,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.41219269590085295,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.7074,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.38394476824780704,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.7202,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4086093487566508,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6803,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.40780618120587225,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.6717,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.40160835831526687,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6519,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.42618142278286036,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.7201,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4495467272165354,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.7003,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.40258287954211475,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.7064,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.38701138060079804,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6509,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.3918323584972186,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.6932,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4267477243262357,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.7286,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.4042167728382733,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.6026,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3800089340729763,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6303,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.40746341645128786,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.616,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.45472749983902716,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.7022,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.38580009315205044,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.6165,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.36183809853024934,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6026,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.3973413421604641,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.6289,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.43386908181467376,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6912,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.40332765590901476,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.6324,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3287877487873551,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.5785,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.400587936119373,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.7204,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.37522794345615756,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6462,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.3804765757964083,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.6166,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.34944340362908555,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.5801,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.3725022787413,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.5849,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.3834493715771417,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.632,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.40476102820663795,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.6604,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.34598749177431276,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.5964,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.37544762752433286,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.6738,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3688018050984644,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6182,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.3762250347026776,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.6341,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.372296414482718,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6501,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.42738576169972864,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.67,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.40312706215282335,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6839,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.36720059452943676,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.6225,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3868428302824951,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.6697,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.33361508488679614,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.6161,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.3709586419165963,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6492,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.40523771678934845,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.6375,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.40686652277172136,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7814,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.4035953631570672,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.7262,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.38453505690308615,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6721,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.4893722918508808,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.6595,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3796131533890088,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.618,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.389940702523096,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.6717,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3780420711864244,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6411,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.3766267031435862,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.6437,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.39133134402719205,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6361,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.3674878182398129,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.6219,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.41221470805706845,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7125,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.42060309637275045,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.7036,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4314788318508879,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.7002,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.3687559261760369,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.6284,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.40892865584466087,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7167,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.3927527761081754,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.6535,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.45343822891686525,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6567,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.3987423062295389,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.7027,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.34101337590193037,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6058,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.37467888023325496,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6611,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4279880666384542,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6429,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.4008990653373971,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.6791,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.4040976062647741,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6614,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.40009160645320296,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.6184,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3584149652915633,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6534,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.42308879251991566,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.6429,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.3530676336565875,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.5875,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.45952337480543376,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.7039,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.4000313956953184,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6528,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.3390785776944779,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.5444,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4164879554122682,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.7395,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.4018332369116052,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.7091,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.401606509590131,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.7054,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.4385767023072862,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.6476,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.3793691830981734,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6676,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.3992745703002846,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.6818,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.38130181064017943,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6855,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.31922457047948377,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.5842,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.4284172920959308,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6487,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.3492126493938296,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.6214,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.40249258176511293,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7139,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.40261627692051555,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.6671,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.37974840778161195,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6705,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.4246235923407765,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.5937,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.37635799514836554,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6311,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.3853814024029868,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.6393,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.41252531259882763,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6276,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.45897883538473566,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.7155,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.4379137702549422,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.7196,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.34869941274882527,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.6355,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.38492263243996416,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.7184,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.3880733678371908,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.6957,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.405049813155491,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7065,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.44029289611675254,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.6582,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.44664165480840895,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6719,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.35226701230834057,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.6235,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3593587552529369,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.581,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.4087204707252798,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.6853,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.3891805891204021,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6547,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.41121914812015825,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.7006,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3629778752294901,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6327,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.4013955562090015,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.6526,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.4013227646872803,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6855,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.5205466487643553,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.6296,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.44612035490967855,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6549,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.42556267254277397,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.6728,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.3732850799010509,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6635,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.3907179778195809,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.6396,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.38192502548447554,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6523,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.3682024172847496,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.629,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.3886285664402178,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6777,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.37918581250956984,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.6488,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3257783036245148,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.5451,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.41977397713779263,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.6171,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.41724976693934324,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6751,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.4533361462180586,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.6946,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.38325422753154753,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6017,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.3421681867758102,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.6061,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.39514516668517125,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6764,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.35344840546408507,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.6253,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.40727430718765384,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6221,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.4688219114010321,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.7087,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.375242685302969,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.7089,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.3607480887273893,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.5905,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.46496149960965294,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6538,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.5837228889124457,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.6289,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.42242509832078073,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.694,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.39721952854255504,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.706,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3939331671218691,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6699,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.47584973128656116,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.6701,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.43754064618906224,
+      "learning_rate": 0.0,
+      "loss": 0.6158,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1127522763669504.0,
+      "train_loss": 0.7323800681114196,
+      "train_runtime": 19553.3332,
+      "train_samples_per_second": 1.023,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1127522763669504.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2b8a71b16dd6a1ff6fe91480e5f7898dcadf8b92
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "down_proj",
+    "v_proj",
+    "gate_proj",
+    "q_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b390b5ad1061169f9a00619f9abaf18c1c9279e4
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f971eca109ad6061df4d9b33538c61d8335775627482a094aab237718af7919
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e10de482eb63139299526970c6edc3d9db9614d9
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb1f04035eeab132dae7260e49c61a1c263972fd2515a86d7ab2d46b030ee2f4
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..97c2e6281b4ceef0b9c6f6ffbd0c87030236670d
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.740625375259329,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.2251,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.9552990309915874,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4193,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.9569001483724141,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.397,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7823732381672138,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3412,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.7645083869407853,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.3502,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.6154109919230503,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.0955,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.8123309890741669,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.2582,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6726504133934107,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.0359,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.6791240317261311,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 1.0694,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.6903249153403939,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.9503,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.7348336374289809,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 1.0206,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7849115277282124,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.0129,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.5834940596241346,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 0.9385,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.6080288380250879,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9041,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.6289182782111001,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 0.95,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.5246426592790335,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.8953,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.571697601687305,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.9473,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.4695061930640392,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.9375,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.49104776110117504,
+      "learning_rate": 0.0001,
+      "loss": 0.8592,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.557235516915619,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9125,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.4773950173485819,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.861,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.4647782924155993,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8597,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5478978441950723,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.9283,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5763142218308771,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.9972,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.5420229317803669,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.9065,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5407268561510891,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8659,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.488056013662824,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.8485,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.476656380319156,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8913,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.5546409263632991,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.9313,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.540416485945182,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9657,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.43395697982419223,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.8576,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.46963036387995893,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8423,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.45539400239698086,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.798,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.4606712595303784,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8476,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.5017330108056747,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.8688,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4274065338288124,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8116,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.47603841684019044,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8709,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.46980138691953705,
+      "learning_rate": 0.0002,
+      "loss": 0.8104,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.4445247338504254,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.8094,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5242809775621773,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.9212,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.4651424134939184,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.8692,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.449789261867823,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.834,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.5203812836903858,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.913,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.4663304312307797,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8124,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.4868169772295852,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.8797,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.3790460899489488,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.7733,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.45106360736919526,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.887,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.408512272799759,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.7955,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.4292310323355621,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.796,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5760647088894424,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.858,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.46819810055070243,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.8403,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5057652506692215,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8328,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.5246022695539894,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8986,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5058257344249311,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8703,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.4501909459351609,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.8701,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.46649398621078,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8789,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.3865589497205679,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.762,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.47464467097697066,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.932,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.4014322315447128,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.7688,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.503124900204115,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8828,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.4408221447128362,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.8346,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5167445238638355,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.9163,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.4685207059609607,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.8311,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.40331836132027354,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7652,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.4697315346699755,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.9228,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.45021848631012573,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8314,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.5515640221319575,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.8923,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4826389903930987,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8487,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.510083073111606,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.8928,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.43786463946699905,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8515,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.470892478542731,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.9231,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5640542733886403,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.8723,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.4371591858764352,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.889,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.41219541736099313,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7701,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.4378002235683173,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.8083,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4882060227684013,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8964,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.4418048035941677,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.8203,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4252195168566824,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7761,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.41113673138492074,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.7589,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.43688240390590394,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.813,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.48937772744416697,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.9127,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.48790047206375037,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8357,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.44384417196333376,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.7833,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4707494051494073,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8338,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.4104480033958369,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.7625,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4435412737512262,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7803,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.45424275991848656,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.8729,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4042880429462926,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.7282,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.42099845264428704,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.7957,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5179888598062934,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8904,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.41315649406651883,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.8141,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.5039344170883732,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.9011,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.428932906748187,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.8359,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.3918990139415261,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7698,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.4867307263306567,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.8894,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.3849900373937041,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7461,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.44339301042600654,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.8389,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4403051271804401,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.873,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.3932316537794687,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.7839,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5150303092373243,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.9516,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.42560379190353087,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.823,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4733956375036902,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.8851,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.41629507697959817,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.772,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4488922942457398,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8442,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.4207918504223135,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.8452,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.3795656815181342,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7297,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.47907397510258765,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.8072,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4142080264062873,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.8323,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.43349213106384954,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.8212,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4670008337658961,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8888,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.40100642981445356,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.8043,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.40081568877954143,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.7262,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.47132320831200863,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.8547,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.43664227626852314,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8621,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.42978129437132245,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.8551,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.43994065392023857,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7863,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.41214536833658816,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.8076,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.5037547666595701,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.7521,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.4028163450562548,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.7186,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4341756973283765,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.7786,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.46218989214333206,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.8423,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.43517248933936953,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.7408,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.44795165042482726,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.7408,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4545616865398823,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7962,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5120527191035785,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.7847,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.45589451863464087,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7998,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.4481327122615246,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.791,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4251840844480094,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8316,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.3628062981318352,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.7163,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4805709842925019,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.895,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.42458589044495515,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.8219,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4113404858173765,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8085,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.4860210093004792,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.866,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.4681857078909186,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8573,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.42375573358728696,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.7784,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.44734150733905653,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7925,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.4461303725232236,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.8014,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.3949997712538345,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8022,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.39244220314624556,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.7385,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.40176139668836713,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7214,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.4314547890333666,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.7872,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4625915384488709,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7953,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.40505205236175806,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.765,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4623439449351562,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8885,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.40620045960175877,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.7914,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.38591399758921124,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7291,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.42371161939980584,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.8044,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.3881700045503985,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7109,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.4482791113780307,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.8399,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4404341406364202,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8161,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.4560567252709708,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.8053,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.48260438414333967,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8069,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.42905621721548526,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.7908,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4141612044013541,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7786,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.46782800192143464,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.8697,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.43284321639254114,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.8078,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.43673194225207906,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.7898,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.4300511780384195,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7702,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.4165428286489318,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.7877,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4008560058497671,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7776,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.38935253294393823,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.6922,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4503657687466216,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8197,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.4691887379486971,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.7924,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.3884194106275563,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7665,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.394762681258105,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.7539,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.3985846530495592,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.7394,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.4643833283764792,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.7741,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.42613032663048966,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.7967,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.3985170031600202,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.7908,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.7356625207849016,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7775,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.45094376276448905,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.7783,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4636435816649117,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8484,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.4621198499956239,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.9057,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.36922838228271354,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.7374,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4965688405829738,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.8498,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.3646938881056897,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.7621,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.3758101689141692,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.7237,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4609739513114896,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7893,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.4372103808260089,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.8498,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5335801995001441,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8743,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.46559554482857823,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.8128,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.38211339446149645,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.754,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.38983877939713985,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.7078,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.35728347883672906,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.44890919500408405,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.7341,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.43143807366697845,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.81,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.5033546677466464,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.8697,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.47626916741189934,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8703,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.48024450509527555,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.7925,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.42792645932493373,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7499,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.4537067692309997,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.7749,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.44128448938912807,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7712,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.4237325523357925,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.7853,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.44618220922984425,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.8255,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.412291999280626,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.8134,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.476265593273783,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8385,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.41576625362728403,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.7821,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.4289971257577167,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8013,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.39105537877275937,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.7248,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3895336927661995,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7006,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.48139123386353133,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.7692,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.42553092787861524,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.7733,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.4198528337621994,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.7154,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.47240006700238607,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8265,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.42225252570225996,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.7502,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.3967444589999123,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.6841,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.4625079056614288,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.8298,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.48961109504154815,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8815,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.43470857997535833,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.7719,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4335611546115917,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.8122,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.4368120206369419,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.8141,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3850030447261817,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7727,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.40477749678485697,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.7856,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.42982204528288137,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7594,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.41021059459830667,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.7721,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.387973868858638,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.712,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.4860347187640322,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.8645,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.44564759178431707,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.8202,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.420660624361008,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.7458,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.41015411316730255,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7232,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.42515818070700523,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.8088,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.4610430638278789,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8274,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.42056918293114154,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.777,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.44536383613524727,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8221,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.421032036109606,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.8321,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.43101712396074243,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7817,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.38674873972167195,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.7869,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.3707518679142606,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.702,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.3916112759808893,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.7938,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.38609299121364954,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7322,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.435661825146532,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.8045,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.49499626868178864,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.8234,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.4893711424890475,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.873,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.4450958116436791,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.8649,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.45547998510701876,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.8302,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.4295009687377971,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7666,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.43034893049915784,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.7891,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4114283725236567,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7864,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.4110886039969523,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.7785,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3855114181967641,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7328,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.4567790027176353,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.8856,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.5619354317424428,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7871,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.42812378397821366,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.8011,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4241679326027458,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7245,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.39538001527672545,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.7696,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.42330872777641004,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.8117,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.385442778809825,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.7262,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.48319857321141446,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8435,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.42823877532031807,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.7938,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4218589476531892,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.773,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.42011382915150025,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.7718,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.45200703035714385,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8206,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.44780678084596004,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.8711,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.3984521596182537,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7721,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.43193742325175283,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.7781,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.37411697466957655,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7818,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.4607327939799755,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.8509,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4039402676301833,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7639,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.4051051026908022,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.7958,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4681079514119628,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.778,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.5089045150757345,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.8048,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.39712170271914216,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8004,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.4392432511684368,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.8395,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4347253143263972,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8149,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.41443178774377587,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.7329,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.4650143976768978,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.839,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.43911311807537096,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.8067,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.39540305914208157,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.7068,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.42966623588159436,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.8177,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.43133822155256435,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7177,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.4244528334614769,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.7657,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.47176055918419235,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.8243,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.41788494818177235,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.7464,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.40337884467661256,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7418,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.43116229829324304,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.837,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4201249121122027,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7734,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.4276002086311205,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.8,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.462778116990809,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.798,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.3895954135717525,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.7247,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.44693768901196756,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7611,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.5012266359481057,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.7782,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.431580052077535,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8136,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.34580350194592,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.7046,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.40521604941592043,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.7822,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.463413863480668,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.8068,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.40150935644843316,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.8233,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.45689557364514,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.823,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4670739142075554,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8149,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.5307781342567004,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.804,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.4208180995221731,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7791,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.45014040176489684,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.8117,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4596321645895528,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7931,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.4028349356623204,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.7381,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.3917587239284584,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.6936,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.3900838090279277,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.7342,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.43758813519958567,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7689,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.40332860167731854,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.7457,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.39487269978666895,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7212,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.4332277656743601,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.8009,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.35652603119573323,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.6794,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.4438369720997122,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.8007,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.42018279839502726,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.758,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.4200937067695404,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.7503,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4884774124325421,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8924,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.3908029573556648,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.7262,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.41808155721721457,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7691,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.40250340781508953,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.7377,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.44929706295556365,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.8265,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.4159689392637273,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.7394,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.43527641503799935,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.8427,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.34219949975786723,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.7182,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4254323711431976,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.849,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.4673742997750265,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.8315,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.3829077844147456,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7322,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.390914529242842,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.7534,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.42361530007513093,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.708,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.45358930689734117,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.7985,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4215442147140462,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.8093,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.4094160983237034,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.7078,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.40893592359557696,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7011,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.4071121116863761,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.755,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4357221168046931,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7251,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.395505603922931,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.7379,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.37045560115272647,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.6287,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.3976537556203463,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.7186,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.5056725192022081,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8429,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.3756217181501402,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.736,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4074679927609601,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7919,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.4249231127747171,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.7303,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.39000956775536444,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7254,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.3916990291806942,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.7013,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.42591790880822694,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7502,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.37489223839291724,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.7018,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.4194676251178253,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7212,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.43101335303581556,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.8161,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4387232065897843,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8008,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.41382439367804846,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.804,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.41211004447198074,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7569,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.40008071317180616,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.7677,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5127153714759964,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7871,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.4490024504182968,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.7269,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.43938571973084534,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7702,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.41014909069350464,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.785,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.41426028858989083,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8286,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.4115488561422381,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.8013,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4214337570754401,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.8269,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.4043638403803839,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.7818,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.6022236587314866,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7983,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.40289661644881036,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.7353,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.37273367086224285,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.6951,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.4682536065646327,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.7607,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.44530767350503303,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.8339,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.4192781089904373,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.7627,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.424279947113004,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7607,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.3624871461837757,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.6784,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4458628496104206,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8183,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.37260410669057686,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.699,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.45846386329482036,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8798,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.3880417003199712,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.6794,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4084947786453557,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7111,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.40911708106332595,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.7549,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.4132999549821528,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.791,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.49385166506659406,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.8749,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.38919074019594835,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.6928,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.43760925349144164,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.7955,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.38914225629869686,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7366,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.36628467222820826,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.6918,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.42253341688016677,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.8499,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.43265824860995267,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.8011,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.45157281512211417,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.8259,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.41067404773789473,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.7546,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.414208097470773,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.7532,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.4208006567151566,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.7728,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.41022324623325745,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7424,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.4372777903897775,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.7284,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4293241178806811,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.8082,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.4703338824301085,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.7858,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.3821269768908472,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.6812,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.4639157162424115,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.8313,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4066065519034959,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7776,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.4069990387589458,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.7526,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.44425750702697925,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.7674,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.40915898668096295,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7172,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.46277505387879625,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7885,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.42489735407616697,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.7767,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.3915573764346979,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7262,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.4694645980008425,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.8272,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.42899293439515646,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.804,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.3740814824219872,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.7416,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.39837081609459923,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7494,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.4582353947458681,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.8083,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4307640402194602,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7589,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.36653586683967887,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.6716,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.544221337878332,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7744,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.4351542944159919,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.7857,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.39901121045005666,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7286,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.4219620512236007,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.7797,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.39815337070858814,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7049,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.3818173849977698,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.7471,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.39122989783011713,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7306,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.3767324234403499,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.7385,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.40191898660582864,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.6948,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.435571201323962,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.7739,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4523390503836961,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7937,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.4485313538011009,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.7949,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.45596518575590006,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.8077,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.42809020109568713,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.7876,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.38023281273221554,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7229,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.41311752581332717,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.6848,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.400008581777007,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7318,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.3811652194716379,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.7159,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.35730391502884745,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.6486,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.4324338486262784,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.7947,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4545592323947514,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7782,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.42678151167414735,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.8018,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.39316885399720736,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7533,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.40953597766510147,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.7312,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.3628151835596889,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.6925,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.3630247652160497,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.6087,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4076144904954131,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7285,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.3948107855238599,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.7689,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.4045910835927847,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7174,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.42784470226134785,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.782,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.42261162570750493,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7867,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.37034688847703967,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.6975,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.37709320180579986,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7107,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.39874988129787453,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.7501,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.38523765832533496,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7129,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.39536564455432827,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.7266,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.3463850559775383,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.6834,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.46367798778196645,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.7669,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.38663458908064,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7596,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.41586262767094645,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.7199,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.46103511895719435,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.8416,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.4184428407337638,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.7527,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.49855699805314946,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.8474,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.42845886618076534,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.8049,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.4530175233083066,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7144,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.41075480954547927,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.7384,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.40708334713709043,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7652,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.5818509229326395,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.8631,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.42118137569346126,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.7961,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.41010451650558516,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.7229,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.423125162993934,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7648,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.40099758305903915,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.7493,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.40085301663186823,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7125,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.3510211250094017,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.6731,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.37050479877114534,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7004,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.4333091776481077,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.7667,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4226511054807109,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7829,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.4081592062701581,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.7048,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.423554063227864,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7994,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.3743142771417236,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.6988,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.44351252150762505,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.8442,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.44646063750669057,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.7579,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.3723488079722956,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7201,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.4129286246507839,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.7467,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.43773886229163683,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7748,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.412176687132668,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.7605,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.41105016474383405,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7548,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.3859611255262852,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.7217,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3970615939573622,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.76,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.4677850684979814,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.7227,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.41074224966622186,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.7136,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.4252016612544697,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.7487,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.37404689361923105,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7631,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.4090939135398797,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.7462,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4369664465142497,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7877,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.40761629202712885,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.6905,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.41875378199551183,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7472,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.8358439241813403,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.7236,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.46388968370700895,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7863,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.39835167876802036,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.7549,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.38420987796805284,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.6909,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.4463675089401714,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.7249,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3659560409886909,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.6831,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.41133170803453634,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.7541,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.42236590760206644,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7408,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.3837860015366334,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.7473,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5160320691639759,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8562,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.3849480248408463,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.7148,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.3941590303338292,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7586,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.38721973528747744,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.7846,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.40064782545917166,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7568,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.39338870914971513,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.7185,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.39029507130944646,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.6981,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.40867482731944854,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.6855,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4382076380381577,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.75,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.4270498186237226,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.7405,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.45075032116777786,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.6953,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.38898341483309623,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.6565,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4033537483239289,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7429,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.42999158597516535,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.729,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.38974476676785047,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.6573,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.3961133656085115,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.6803,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3823908001357277,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.685,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.49877993731022596,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.8469,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.42866883973871944,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7263,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.4012134571717772,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.7376,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.40274292876935464,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7137,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.3890943884589063,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.7607,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.41888096021420373,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.8449,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.39795135596529974,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.7654,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4176974400501837,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7507,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.4290373283668271,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.7141,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.3788056047338438,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.6782,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.4030490255452256,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.7107,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.8704552300643414,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7185,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.4022264828473569,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.7202,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4229430400668877,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7891,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.4231583162337773,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.6874,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.43153948757655713,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.6786,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.4198059022745376,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7531,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.3811705461151831,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.6858,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.4190940859985535,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.7143,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4536687433193188,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.8082,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.41071086963109465,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.7351,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.47882365797373727,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7702,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.38295352477242467,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.6919,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.42881355586122355,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7571,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.38449068440791867,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.6772,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.5086204503020627,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.8256,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.4301701690919157,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.8075,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4058292645531843,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7181,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.3537948967625652,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.674,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.3804409614599797,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7096,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.3973976042378748,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.6934,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4050141353906074,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.757,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.3986132692795599,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.6967,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.40584716553831934,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7235,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.40044297473824103,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.758,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.3848338429889167,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7297,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.42139677544301846,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.7165,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.36554253586630076,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6707,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.36826074350319077,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.6653,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4373697897927393,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7336,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.4201047732025366,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.7428,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.38396768744068105,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6727,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.3730930808471466,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.6945,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.42098927252534707,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.744,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.3951808184820184,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.7275,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.41872967335722294,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7741,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.4282331565942988,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.6753,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4366654761596964,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7224,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.4502231199852326,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.869,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.35101016716715344,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6528,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.46211253437428196,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.7658,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4179053401194437,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7627,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.42304596138171013,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.7285,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.38481247196526547,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.724,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.43738464403250726,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.781,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.40584209275053346,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.7206,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.3855582382310194,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.6935,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.3673208093686153,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6593,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.41615163514555564,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.674,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4624231812328161,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7679,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.39163698830776683,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.7186,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.3696327093662802,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7175,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.3721727885224274,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.6454,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4738677251372763,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7833,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.43173794439134333,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.7576,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.45922038884476846,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.8029,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.4646870984818398,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.7145,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3777597816259183,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.6997,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.36089691320765555,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.649,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3973909913995549,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7399,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.44075193787463834,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.6699,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.38675999472414196,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6949,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.3825013397690802,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.7111,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.400911462526282,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7051,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.4139699698844406,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.7686,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.43175536022693406,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7591,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.4230263494240272,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.6988,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.3737762850727218,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6842,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.38964413678090243,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.743,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.37543935689794156,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7152,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.43321509500863586,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.697,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4303903400259315,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6405,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.3760923694217793,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.7234,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3736237674593301,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.6894,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.394451489947489,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.663,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4647786289303067,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7257,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.3922140990005979,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.7071,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4290632195123379,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.7473,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.4060539612859277,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.6824,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4227461339262886,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7516,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.4431357699775886,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.7619,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4093628115550298,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.7209,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.37441216848578235,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.6471,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.45421403580973224,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7773,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.37713032856522577,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.6931,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4300256782238676,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.7121,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.4264632541041666,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.7352,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.45548600133711703,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7462,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.3690942812425331,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.7231,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.3999966576574989,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7114,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.38948232123577514,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.7409,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.481230685207155,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6837,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.4503212121059997,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.7794,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.36614189932253133,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.6793,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.46491602621791206,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.7656,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.39604886431863673,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.737,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.42064628624762074,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.7578,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4254702982415326,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7621,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.40597402249866665,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.7541,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.3608489448693908,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7173,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.4091979039360563,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.7701,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3460555035826413,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.629,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.42243337211845133,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.7449,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.39767090591192195,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7283,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.45863817731004536,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.7254,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.35430390957391245,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6686,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.367535141832263,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.6957,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4299938590504528,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7537,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.37255287859587105,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.6413,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4205287420555947,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7039,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.36889169736730626,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.6985,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.4066798466103115,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.6832,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.38308578601888854,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.7013,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.36880126553715525,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6992,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.44451092597637976,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.7202,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3716178286616881,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6519,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.4426458653806619,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.705,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.37895263210753366,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7287,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.3999977558858813,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.6768,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.3956912810776047,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7209,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.4424455275777821,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.7579,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4673803319588536,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7861,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.39630784564816757,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.725,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.3804278965686192,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7224,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.31830914083965456,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.6331,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.42548505965409894,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7231,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.3585288604599067,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.6857,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.33174657718323153,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.6791,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.4060636357776109,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.7667,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3905132002051727,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.7016,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.4109437568865212,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.7591,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.40342604974072854,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6846,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.37153699490697906,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.6963,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4664907888794908,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.781,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.3719035890782228,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.6836,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.3543157408407524,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6417,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.3831151097875646,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.694,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.38888890747319,
+      "learning_rate": 0.0001,
+      "loss": 0.7479,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.3945568364244657,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.7192,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.35244019029902424,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6542,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.377560958078493,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.6846,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.37257558644780747,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6823,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.38257422868362584,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.6866,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4855592521266635,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7963,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.40036227247467976,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.6637,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.32867665206140373,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.584,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.4198925818656027,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.7955,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.3972556335268183,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6863,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.3887101864579285,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.6686,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3781477995356685,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7068,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.3901639415685532,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.7113,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.3869702546950535,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.718,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.34995862952758633,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.6668,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.39437970951145673,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6747,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.41130325981029686,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.7083,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3741628506410771,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6863,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.39128973461786043,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.7628,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.36977609063465494,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6734,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.451538422818693,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.7499,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3712987515739473,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6851,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.42219531239486613,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.7346,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.43330289478955075,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6971,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.4492720289986999,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.7711,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4931802042202946,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.8143,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.4411906793227389,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.7267,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4670472290047497,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7854,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.42677123995377886,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.7881,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.393010703318978,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.707,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.42520822204478687,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.6535,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3829198148572694,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6402,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.42465039688004347,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.7564,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.3697197107568034,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7341,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.4356568160517684,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.6909,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3887868918363411,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6851,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.41502794218844097,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.7092,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.3508424005258134,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.6415,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.3906391535054069,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.7433,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.42460964802698714,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6994,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.36260277367149235,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.6549,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.4021897399655954,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6354,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.41526985094243396,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.7671,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4146212411057442,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7431,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.37180097014227725,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.6356,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.36251068211454873,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6456,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.3586531608730886,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.6476,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4603340281684909,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6984,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.3870379005898693,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.7093,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.4027472629038024,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.71,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.3528191478928965,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.6294,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.33825869458909047,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6298,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.36037288534137707,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.6626,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.4889828579133769,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7661,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.3615120502652659,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.659,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3819622314915852,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6899,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.3625524164766474,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.7175,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.37825883451692044,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.6772,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.38042278841918303,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.6986,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.35804466110483835,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6795,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.3281736917944397,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.6371,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.36691010384976763,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.7083,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.37654825968461336,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.729,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.483695314586318,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.8131,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.3978851319398291,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.7137,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 1.0701915284130301,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6879,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.40766989240820967,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.7013,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.42485817360229233,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.7917,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.4529163662639259,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.7932,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.3757526686525042,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.703,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.4157260966214999,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.7377,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3684872875873162,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6747,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.4352203721253499,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.6921,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.3688596385164379,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.675,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.38601442265441466,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.7195,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3688057036357684,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6881,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.39165481815069025,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.6375,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.3605859209790284,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6295,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.41339282016265194,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.7261,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3720958703979262,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6324,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.38369394721037564,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.6528,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.39487035674006327,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.644,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.364053557875168,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.655,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.4091073164543154,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.7148,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.3860449431917968,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.7492,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.4014892334379826,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.684,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.4258334466602001,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.7487,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.3804284937666935,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6785,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.3680495957717326,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.6839,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.35916091012297474,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6831,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.4853055134210838,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.7819,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4006664858164372,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6664,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.38752120698758835,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.6748,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3593820576140191,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6377,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.3659247324494101,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.6446,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.37344276743184085,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.677,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.383307924485725,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.6778,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.3903104380329308,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.7101,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.34736833042562243,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.6322,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.39453909643963014,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6983,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.40376938298853243,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.6717,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.42368985381238794,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6669,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.4236510598985175,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.6447,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.41339350765198635,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6922,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.7218375320143412,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.7486,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.42855190255193215,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.7084,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.3853592981630134,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.741,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.360165680338082,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.6452,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.38043309835102074,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.6614,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4112894815413764,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7781,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.34908945812710956,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.616,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.4170878218052713,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.7444,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.36594322008531244,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.6887,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3391957470731677,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.5661,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.374998163890273,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.6748,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.5062788654494025,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7409,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.4033029379370935,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.7613,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.4816638722246713,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7674,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.4911048448099762,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.7677,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.35508407796578995,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.5686,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.4271338131050412,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.7709,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.3829874443837163,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6703,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.3772201380222238,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.6416,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.42957061904173477,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6785,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.4351015238354584,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.717,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.44387870196411,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7456,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.38345840918195717,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.6366,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.40318390774851576,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.6941,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.39216006216230787,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.6587,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.4257985590229207,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7713,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.3931310762318468,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.6648,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3642652770131762,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6814,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.364177253154222,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.6627,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.3523087379696415,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6288,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.4059771452595751,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.6667,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.482718976349731,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7562,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.41578835122028934,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.7022,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.36922905393572486,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6519,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.3762926224218096,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.7232,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3978686265348035,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.7099,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.32496498392338546,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.5562,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.42086421906589716,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6752,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.3656906027819285,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.7032,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4161670937660567,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7525,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.37830212145755643,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.7098,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.41658258367628426,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7551,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.36045955475606195,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.66,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.37412315585512523,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7219,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.3940531238984509,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.6799,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.42150488983322487,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.7225,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.4299804274671972,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.7594,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.37550706071176354,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6751,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.35985151186544345,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.6515,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4831717492898829,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.8192,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.40579059753134555,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.7253,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3937601854305587,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.7437,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.338102934648602,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.6423,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.38581607564898074,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6956,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.4113944032955229,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.6973,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4095900672318419,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7466,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.4292823646665635,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.7328,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.37476926363657725,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.7136,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.4440561071976436,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.6965,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.34366750397941753,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6471,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.4103654288184694,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.7292,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.3362083965809128,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6451,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.4146917862331588,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.7293,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.5184762847393413,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6954,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.3669542021968653,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.6601,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3851300822559215,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7422,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.3600485993160188,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.7124,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.3741263093251435,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6974,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.40506980225745537,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.7157,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.37142754860523913,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.661,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.34868864855170956,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.6331,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3844359901312866,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6706,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.3679428435598268,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.6432,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.3609800351229282,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.644,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.40857060655903316,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.6928,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.34890882720035793,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.6659,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.34646708731898007,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.6275,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.38567908108552645,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7153,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.4305873982759078,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.7274,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.43135102883365606,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7578,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.4047217454206257,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.6523,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.38403048246137017,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6723,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.4149002869362283,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.7515,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4718520933027861,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6489,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.38813378130757714,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.658,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3939601361015646,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.658,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.3881313888834844,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.7287,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4301470833777586,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7421,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.4287457170994794,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.6678,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.3813750880666755,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6476,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.4201512651843253,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.7701,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.40329968841178876,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6548,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.435542688212676,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.7366,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4166070391500017,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7087,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.3790452016246712,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.6651,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4087162996368245,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.7629,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.45517039116161556,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.6626,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.3907054046676461,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6617,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.4112333855403147,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.7102,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.39196525023148127,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6492,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.39241913793819494,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.6458,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.36655779865171095,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6473,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.3695327576372595,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.6977,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.39337691074256637,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6695,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.43078872845909777,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.7215,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.41677583461043594,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7434,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.37409959960608413,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.6174,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.37530733361044,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6646,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.3948418672519029,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.7504,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.3466028849080845,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.5993,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.36659625418677094,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.6604,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.37078755725390006,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.7182,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.3795044276270682,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.6524,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.33482991952784324,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.5969,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.4313167251920351,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.7922,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.39092361080138693,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7368,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.3825571527685044,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6939,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.40444046894376534,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6593,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.40668490269779606,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.7082,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.40044672350004745,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.677,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.4690244764733942,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.7342,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.3793000401009645,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6776,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.37205962385934116,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.7073,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3867584388600962,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.597,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.3662766708062122,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.6005,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.37132830126650684,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.652,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.3717335656538945,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.6777,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4071587052230116,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.5889,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.39199370584020554,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.6794,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.4083045902562364,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6869,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.4364514601217797,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.7299,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4272030308395742,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6443,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.3830830999836823,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.664,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.45790237792918964,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.7071,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.427243169382132,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.6815,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.42608561465983186,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7259,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.35803222200964013,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.6099,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4105132717548988,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6861,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.41160615603238776,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.6691,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4125210074420269,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7136,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.3947149593570214,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.625,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.36897199999238556,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6422,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.3989802751482518,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.705,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3496190401577539,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6207,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.4900280407157522,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.8007,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3551601267452017,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6354,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.38899309001246807,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.6374,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3390341726853945,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6266,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.40606795460219025,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.6824,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3854419536808137,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6218,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.3897364021862041,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.7032,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.46000968922306035,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7736,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.3939240859217374,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.6861,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.35864600105529776,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6317,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.39750925834879125,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.7157,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.40159732727993463,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7057,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.4113036440798727,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.7105,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.401315698737299,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6668,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.40790670367724563,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.73,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3727532374615525,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6618,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.3503374501600405,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.598,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.40563869970756045,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6812,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.3838583386528972,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.699,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.40358621332732,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6888,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.3880879506544284,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.6911,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.370531095853474,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6404,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.34209538531266953,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.5939,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4236755140832168,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.7576,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.3658400080148741,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.6159,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.39879796431319914,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6558,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.35087830936122166,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.6117,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.37387322928602545,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6427,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.4872607931728202,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.7606,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.38657425724887196,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6303,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.4134505793433531,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.6938,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3598444447813142,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6509,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.3617011879924967,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.6574,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.400947050757263,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.645,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.4220074193387606,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.7001,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3732493162557107,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6036,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.40301328291104704,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.686,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4108059320184453,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7085,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.4125815276658339,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.6322,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.36863849906243684,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6494,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.39074398089237977,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.6682,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.4584296705306387,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6281,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.38788779905511994,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.6993,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.40701330975481226,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.7688,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.4260094945414775,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.7242,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.4057036035467596,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7253,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.40661154553380663,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.6711,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.35937310430012653,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6769,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.35787645420113195,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.6513,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.391945553497051,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6367,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.37326197481065054,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.6557,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.36011725309369086,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.623,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.38836463607963373,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.6578,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.3871726490315593,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6655,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.5688223950738495,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.7054,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3816671225013523,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6657,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.4080687388597147,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.6535,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.369691368379072,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.5877,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.48218290349226944,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.6572,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.34774664869458893,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6428,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.4195235418480639,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.7067,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.39283130785285547,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6851,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.3655565612948235,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.628,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3739719837457371,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6403,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.5015560909701027,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.6904,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.39831542125162783,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6953,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.47199034653133004,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6433,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4101857437841964,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6471,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.3789851658317438,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.6639,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.4014035059775117,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6576,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.40684685579476415,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.7052,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4804061527698471,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6641,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.4557510834099288,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.7149,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.39465648052244057,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.7206,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.392256741376962,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.7162,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.40585278793645196,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7213,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.4105575124195198,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.6508,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3869157683581518,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6491,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.4096673437785533,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.6814,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.36368840976953926,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6322,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.41570328821661107,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.658,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.4110249505203712,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6624,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.4646084606799718,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.6641,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.42541671333207876,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6915,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.43528404059096026,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.6656,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.3670568988549724,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6284,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.4289843557276394,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.672,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4135409791582618,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6799,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.3724300445578464,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.7023,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.5338609126577212,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7982,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.35914040240435635,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.6683,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.36043635690058995,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6507,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.409236991133682,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.6789,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.35135827201477715,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6044,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.39791158187210746,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.6608,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.36810381494389127,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.7086,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.39214302549414787,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.608,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.3482304751448067,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6041,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.38120946174275433,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.6484,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.40585774138554015,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6494,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.39432294021606884,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.6698,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3561326704328232,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6445,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.3920736293397295,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.5798,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.4272035629768845,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.7124,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.44778088302282665,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.6948,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.3600814432343671,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6212,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.3230883470653162,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.5683,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.43186613411349173,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.7389,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.40047764102962335,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.6792,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.3799604559465791,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6382,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.37789477345298844,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.6248,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3634941724138379,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.5894,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.41025985607363014,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.7381,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.3969457907755805,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7193,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.37293574757945425,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.6555,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.38444359516308163,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6224,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.3565223003207056,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.6304,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.4089104255913111,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.7657,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.4613564539632201,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.6718,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.43717369275721013,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7229,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.41569821826961406,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.7392,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.39494922997660814,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.7029,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.4122895274541303,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.7089,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.39725854975172903,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6865,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.37464112072190875,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.6389,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.38588828787104884,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.6096,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.40372796341268163,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.6824,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4103247224108773,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6726,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.35331636496976654,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.607,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.35955029150196705,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6452,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.44853051305761393,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.7252,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.38700045620030166,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6843,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.36267065747739924,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.6508,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.42430465371526943,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7086,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.4038402527015271,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.6854,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.39445659158549284,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.7104,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.37671583143686765,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.6702,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.40260912294792317,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6585,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.43837270664689515,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.6583,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3893014770348551,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6087,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.36216323834138736,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.5883,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.46412683605673033,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6804,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.40285204013158576,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.7045,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.370915249411144,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6247,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.36105068358775616,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.6373,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.4631400901388791,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.7066,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.4372476982607606,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.6773,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4031985086379773,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6627,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.4077896779241173,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.6703,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.38703008590275734,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6869,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.389822216835128,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.6213,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.44884734309677693,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.698,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.362517575944606,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.6735,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.3846628320342674,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6225,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.4059611932840297,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.6448,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.39326903402007896,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6267,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.4012011863886583,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.6433,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.363230791976257,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6202,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.3650631404313902,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.6955,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.37561230808243434,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.5872,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.3623780010217766,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.5793,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.3505105560783126,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6318,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.4271666747293594,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.7364,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3716203795664881,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6343,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.36258914631174205,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.6528,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.35711272822264123,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.5952,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.3878037895522494,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.6925,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.37209329467917407,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6323,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.379412877060846,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.6751,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.7891292376304488,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6932,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.4081006916394357,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.6861,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4073538388075811,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.683,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.4414279198048497,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.7051,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.3808359904757574,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6484,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.524126867630712,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.7399,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.39951867379220407,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.7088,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.37138460708916754,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.6687,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.34826122882986327,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.5969,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.3410088949598137,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.5928,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3950409737979152,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6379,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.4102781207089552,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.6908,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.3429034133578174,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6015,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.3632312736357231,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.5986,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3826228822931815,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6743,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.42683569045701925,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.621,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.3605030746262158,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6233,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.41230326194585215,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.778,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4957680011505481,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.7457,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.4072547648885756,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.645,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.40754438425649125,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6756,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.39653427847439454,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.7184,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.43177459051227624,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.7636,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.39263907106487356,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.7081,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.3624972677767638,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6399,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.33843811085319553,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.5777,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.34161536055166386,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.5904,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.44047485203652753,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.7361,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.3918072282659997,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6349,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.3590971052687072,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.6231,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.40635778568154485,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.7118,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.3788357955249413,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.6275,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.3888433203841177,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6681,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.420258808376998,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.7226,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.409072040849215,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.709,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.38157508113901245,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.6068,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.3719039482229343,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6442,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.4128679484467576,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.6828,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.37493907613039723,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6248,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.47831421098169785,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.7748,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.3944876198518206,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6075,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.4602873397181344,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.7408,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3915401092739435,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6955,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.42777489757566567,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.6815,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.3806850442829798,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.6528,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.3971572784686375,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.657,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3682731235206281,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.67,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.43164071322348224,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.6595,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.37823047074358207,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6407,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.413131874588114,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.6021,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.365271643610721,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6173,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.37148101784065213,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.6373,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3545282942252107,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6235,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.34589376857882104,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.6217,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.37946025391411065,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6679,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.4179883388920593,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.7016,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.40606496916550056,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.7358,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.34722654274842113,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.6104,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5640657778816404,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6893,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.34887272732285135,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.676,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.42103436552600304,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6671,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.397458685970188,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.6084,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.4560587003423442,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.68,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.38853994918812274,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.6739,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4607525754543247,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.708,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.35855112978980025,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.6008,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.5252440713870019,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6639,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.397145098522711,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.6954,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3947058730584167,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6591,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.39151628751957473,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.6555,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3912418614245154,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6689,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.4313141016146209,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.7063,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.4077614627946444,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6593,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.35297209831565624,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.6551,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.37755494085747254,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6392,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.43709833891458216,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.6573,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.3974583590147279,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6537,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.4071793814200911,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.6909,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3601247563476641,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.5835,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.4482109248429826,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.6992,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.46998382865881133,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6443,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.3357347367728721,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.613,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3616675167607055,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6755,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.36199504917256037,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.6938,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.38089364449249724,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.5978,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.3715196558561812,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.6615,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.36779570170659703,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.639,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.38633718133207556,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.6676,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.40549553495675567,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6401,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.43144403204092113,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.7053,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4141940362584789,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6576,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.4351045764338084,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.651,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.3679347474501111,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6398,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.3702780498157318,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.5921,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3713791644136914,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.5971,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.36455460169278,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.5863,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.4040556238367524,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6733,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.4197533321599414,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.6676,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4440182289183848,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7207,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.4413081297715539,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.6119,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.41931313708560863,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6595,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.4246446041542137,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.6851,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.4277447086602959,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6735,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.41970169765480425,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.675,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.42411333899106246,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.7166,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.4161922519419455,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.6544,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3939621504358724,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6234,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.37831960885850807,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.6426,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.41250023097697036,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.7239,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.4886043521145033,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.6619,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.40061777538209004,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6733,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.38345316709018273,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.6978,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4299670022115295,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.693,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.373034908425949,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.6308,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3890648152286451,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.7081,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.38355611150181956,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.6301,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.41208221462674705,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6756,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.427348556462753,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.6884,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.40449145649417545,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.685,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.41362539723590275,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.6555,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.5162633241322898,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.678,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 2.7048013739560886,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.6383,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.38623038093351664,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6599,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.41107408813820095,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.7122,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.3420118306844945,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.5963,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.44860775950775655,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.7645,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.41956358789991066,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6787,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.3996878685357968,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.631,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4306761188580242,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6034,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.33380908699505984,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.6212,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.380156098174372,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6877,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.48012803001265547,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.677,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.36836128639431226,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6486,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.3702855334521945,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.6211,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3680229815646194,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.5959,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.39723037587968774,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.5769,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.3994223107492055,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6779,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.4039869427449052,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.7039,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4103412609606496,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6718,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.3690177610632917,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.6632,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.4219196717424533,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6655,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.4018337233141601,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.6778,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.35681086222972497,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6222,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.3292173339508648,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.5913,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.3936008885386386,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.6426,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.43919355223463497,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.7145,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3932163380580836,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6133,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.3636623163514951,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.6347,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.39874386848746346,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6228,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.39882330921672243,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.6561,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.5197317454420752,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7055,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.4221316807203784,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.7113,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4142165008630827,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6967,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.370064302517949,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.6371,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3751873246821837,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.673,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.45211984651576587,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.6897,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.39165760279121553,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.612,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.3748915391069381,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.6178,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.43537912317156524,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6543,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.46162976257051774,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.6925,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.4029308621400298,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6833,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.33775265813241384,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.5976,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.37593495620956857,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6027,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.35309989717011225,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.6301,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.39569637592322077,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6415,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.43208602953113595,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.7,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.37194342945756875,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.5833,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.38416040502013293,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.6857,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.335484737327022,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6161,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.38061953088212563,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.6358,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.33884120897595343,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6345,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.3934930628376492,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.6609,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.39204428051475615,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7247,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.39874105434440155,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.7189,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.39100579087505305,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6408,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.36761220334225153,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.6091,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.3700637554607591,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6476,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.37325222898220134,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.6141,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3366722732870449,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6474,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.4155578015379489,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.6683,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.3892625290957849,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6482,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.4087626640032446,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.6054,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.6956867107166412,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.677,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.4086757171312014,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.6356,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4111855344094051,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7552,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.38432449021004866,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.6583,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.4085405245045004,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6984,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.5039455367154145,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.7574,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.47813106625043633,
+      "learning_rate": 0.0,
+      "loss": 0.7561,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1128654464909312.0,
+      "train_loss": 0.7288802340984345,
+      "train_runtime": 19621.0755,
+      "train_samples_per_second": 1.019,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1128654464909312.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7420ff54ab7c3b9766a89c31fa2c7cb1f7c3b514
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "up_proj",
+    "o_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..afdaf70f264f5b8fea8bc868b3ba8122d7d511a1
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:125fefd8d9ea1985becd336ceaa4bc360bbf3e48f1c6027ac96ec281a8a0a522
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a251319c76b0156dc0be44c42f00bbd7b0aaf063
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8495835d7af9abaf3748c69217f36f0d11b20f2f2dbd26d9f978efd43acd004
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c6795165eef5af58159a34f34709057c673a03be
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,8792 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.7888862334183341,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 1.2004,
+      "step": 1
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.8072098866777677,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.3362,
+      "step": 2
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.934453822221645,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 1.4111,
+      "step": 3
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7995143788702597,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3153,
+      "step": 4
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.8045228585693739,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 1.3712,
+      "step": 5
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.6936886266377906,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.1641,
+      "step": 6
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.8405796761351603,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 1.2383,
+      "step": 7
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6769420946388747,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.0148,
+      "step": 8
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.6733265030653172,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 0.9795,
+      "step": 9
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7335159233425952,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 1.0197,
+      "step": 10
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.7045436470282542,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 0.9439,
+      "step": 11
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7589080717502401,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 1.0242,
+      "step": 12
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.653064525079337,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 0.9467,
+      "step": 13
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5349660353486346,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.851,
+      "step": 14
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.6474693250698709,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 0.9351,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.5013787309930555,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.8601,
+      "step": 16
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.6108292538136464,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 0.9248,
+      "step": 17
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.5445189776654845,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8735,
+      "step": 18
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.4929435837776125,
+      "learning_rate": 0.0001,
+      "loss": 0.8435,
+      "step": 19
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5305886244534136,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9159,
+      "step": 20
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.43968440859680985,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 0.9122,
+      "step": 21
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5173469539153005,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8479,
+      "step": 22
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.5626542058275531,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 0.9466,
+      "step": 23
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.5841597173947826,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 1.0017,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.4951786473740263,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 0.8452,
+      "step": 25
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.5849316515907934,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 1.0015,
+      "step": 26
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.5042752384969739,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.9226,
+      "step": 27
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.4853631688719905,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8992,
+      "step": 28
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.531795828719562,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.9509,
+      "step": 29
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6638752979985171,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.9347,
+      "step": 30
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.45278275207606566,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.8278,
+      "step": 31
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5132822719865044,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8264,
+      "step": 32
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.5022331101351186,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.9087,
+      "step": 33
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.46575474125212074,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8606,
+      "step": 34
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.5391928815995534,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.8977,
+      "step": 35
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.44692616061569684,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8352,
+      "step": 36
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.44614250941619077,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.8583,
+      "step": 37
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.433289640457569,
+      "learning_rate": 0.0002,
+      "loss": 0.7968,
+      "step": 38
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.44517420661683976,
+      "learning_rate": 0.00019999966405802826,
+      "loss": 0.8654,
+      "step": 39
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.48805276441168305,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8959,
+      "step": 40
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.49550073472400363,
+      "learning_rate": 0.00019999697653579705,
+      "loss": 0.8294,
+      "step": 41
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.439426740092416,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.863,
+      "step": 42
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.47191451920826855,
+      "learning_rate": 0.0001999916015635627,
+      "loss": 0.8798,
+      "step": 43
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.46627051102404476,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8755,
+      "step": 44
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.42721036955736064,
+      "learning_rate": 0.00019998353928577919,
+      "loss": 0.778,
+      "step": 45
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.48334276256170616,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.7948,
+      "step": 46
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.4557680208107082,
+      "learning_rate": 0.0001999727899191228,
+      "loss": 0.8604,
+      "step": 47
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.42223230996410294,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8521,
+      "step": 48
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.4905567778621406,
+      "learning_rate": 0.00019995935375248606,
+      "loss": 0.9098,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.4242061181062434,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.7911,
+      "step": 50
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.3886571487835808,
+      "learning_rate": 0.00019994323114697022,
+      "loss": 0.7497,
+      "step": 51
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.44997119706190275,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8616,
+      "step": 52
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.4780377708601232,
+      "learning_rate": 0.0001999244225358753,
+      "loss": 0.8795,
+      "step": 53
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.4134394710050785,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.8178,
+      "step": 54
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.4549312991117919,
+      "learning_rate": 0.00019990292842468868,
+      "loss": 0.8209,
+      "step": 55
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4943475167911096,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8577,
+      "step": 56
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.3981263934035132,
+      "learning_rate": 0.0001998787493910712,
+      "loss": 0.8101,
+      "step": 57
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.4791596433231652,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.9053,
+      "step": 58
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.4301117473556097,
+      "learning_rate": 0.000199851886084842,
+      "loss": 0.8568,
+      "step": 59
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.4553403450217621,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8905,
+      "step": 60
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.47092985413048144,
+      "learning_rate": 0.00019982233922796085,
+      "loss": 0.8354,
+      "step": 61
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5091746908825188,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.9167,
+      "step": 62
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.5308818498640732,
+      "learning_rate": 0.00019979010961450878,
+      "loss": 0.903,
+      "step": 63
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4467185225835727,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8313,
+      "step": 64
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.4802224442428892,
+      "learning_rate": 0.00019975519811066663,
+      "loss": 0.85,
+      "step": 65
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.44615653657864884,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8224,
+      "step": 66
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.4450950170946168,
+      "learning_rate": 0.0001997176056546921,
+      "loss": 0.8457,
+      "step": 67
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.43739704558596904,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.871,
+      "step": 68
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.42492503347720234,
+      "learning_rate": 0.0001996773332568941,
+      "loss": 0.8066,
+      "step": 69
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.39277555946078135,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.7761,
+      "step": 70
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.4547785885866476,
+      "learning_rate": 0.00019963438199960599,
+      "loss": 0.8484,
+      "step": 71
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5087443545551769,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.9514,
+      "step": 72
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.4421694324028681,
+      "learning_rate": 0.00019958875303715615,
+      "loss": 0.8356,
+      "step": 73
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.3693836667074255,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7293,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.4545505782833525,
+      "learning_rate": 0.0001995404475958373,
+      "loss": 0.8674,
+      "step": 75
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.5149111832904231,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8317,
+      "step": 76
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.4222308910851471,
+      "learning_rate": 0.0001994894669738732,
+      "loss": 0.7796,
+      "step": 77
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.42773268962424477,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.8301,
+      "step": 78
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.4510758829243804,
+      "learning_rate": 0.0001994358125413841,
+      "loss": 0.8175,
+      "step": 79
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4386879178055002,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.883,
+      "step": 80
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.4247094243098374,
+      "learning_rate": 0.0001993794857403495,
+      "loss": 0.8264,
+      "step": 81
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.39731806722513374,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8113,
+      "step": 82
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.43059843269685766,
+      "learning_rate": 0.0001993204880845699,
+      "loss": 0.8204,
+      "step": 83
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.46411656604260365,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.9093,
+      "step": 84
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.42444777199515643,
+      "learning_rate": 0.00019925882115962568,
+      "loss": 0.7659,
+      "step": 85
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4126543869436709,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7975,
+      "step": 86
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.4749702188611372,
+      "learning_rate": 0.00019919448662283478,
+      "loss": 0.85,
+      "step": 87
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.3956659659924778,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.7673,
+      "step": 88
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.4026016211878808,
+      "learning_rate": 0.00019912748620320794,
+      "loss": 0.7614,
+      "step": 89
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.4369069700224426,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8282,
+      "step": 90
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.46820403041406944,
+      "learning_rate": 0.00019905782170140238,
+      "loss": 0.8365,
+      "step": 91
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4993966873868318,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8955,
+      "step": 92
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.4559482258642977,
+      "learning_rate": 0.00019898549498967343,
+      "loss": 0.7761,
+      "step": 93
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.44067884451502826,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.8391,
+      "step": 94
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.4669300417941205,
+      "learning_rate": 0.000198910508011824,
+      "loss": 0.8143,
+      "step": 95
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4234634201092707,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7585,
+      "step": 96
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.4327595021500628,
+      "learning_rate": 0.00019883286278315262,
+      "loss": 0.8077,
+      "step": 97
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4640301849434795,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8626,
+      "step": 98
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.4323764808876556,
+      "learning_rate": 0.00019875256139039902,
+      "loss": 0.7699,
+      "step": 99
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.49768714480089576,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8951,
+      "step": 100
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.450340029550535,
+      "learning_rate": 0.00019866960599168826,
+      "loss": 0.7956,
+      "step": 101
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.4162610309184303,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7993,
+      "step": 102
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.4498058897727285,
+      "learning_rate": 0.0001985839988164726,
+      "loss": 0.8608,
+      "step": 103
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4386921106812282,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8507,
+      "step": 104
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.4816905812977084,
+      "learning_rate": 0.00019849574216547171,
+      "loss": 0.8811,
+      "step": 105
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.3785246676744159,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.8009,
+      "step": 106
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.44762669321774,
+      "learning_rate": 0.00019840483841061058,
+      "loss": 0.8272,
+      "step": 107
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4358359003040998,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7362,
+      "step": 108
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.44561339298175845,
+      "learning_rate": 0.00019831128999495606,
+      "loss": 0.7828,
+      "step": 109
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4294708664538283,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.7471,
+      "step": 110
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.47008937042593385,
+      "learning_rate": 0.0001982150994326511,
+      "loss": 0.8299,
+      "step": 111
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4220741908718888,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8113,
+      "step": 112
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.4389790650159753,
+      "learning_rate": 0.0001981162693088471,
+      "loss": 0.8382,
+      "step": 113
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4388632276201582,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.9138,
+      "step": 114
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.45562440293018436,
+      "learning_rate": 0.0001980148022796345,
+      "loss": 0.872,
+      "step": 115
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4102332619971319,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7739,
+      "step": 116
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.39953771833214924,
+      "learning_rate": 0.00019791070107197153,
+      "loss": 0.7477,
+      "step": 117
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.46681097789976966,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.857,
+      "step": 118
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.42500801352275824,
+      "learning_rate": 0.0001978039684836106,
+      "loss": 0.8298,
+      "step": 119
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.44011396356640464,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8948,
+      "step": 120
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.5114027857148425,
+      "learning_rate": 0.0001976946073830234,
+      "loss": 0.8689,
+      "step": 121
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4097399009368278,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.7868,
+      "step": 122
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.46786182756581846,
+      "learning_rate": 0.00019758262070932375,
+      "loss": 0.8498,
+      "step": 123
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.42716743980313704,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7826,
+      "step": 124
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.4603939853125694,
+      "learning_rate": 0.00019746801147218842,
+      "loss": 0.83,
+      "step": 125
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.47898291377153074,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.81,
+      "step": 126
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.513393236930296,
+      "learning_rate": 0.00019735078275177654,
+      "loss": 0.7822,
+      "step": 127
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.45006012040918797,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7338,
+      "step": 128
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.4557955954957533,
+      "learning_rate": 0.00019723093769864663,
+      "loss": 0.7859,
+      "step": 129
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.45112089670358757,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8281,
+      "step": 130
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.4099562601936963,
+      "learning_rate": 0.0001971084795336719,
+      "loss": 0.7823,
+      "step": 131
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4363924382231477,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7608,
+      "step": 132
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.4716649317292637,
+      "learning_rate": 0.00019698341154795389,
+      "loss": 0.8259,
+      "step": 133
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.46078755661937293,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8555,
+      "step": 134
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.4543933387592577,
+      "learning_rate": 0.00019685573710273376,
+      "loss": 0.7473,
+      "step": 135
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.41092088159032913,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7499,
+      "step": 136
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.5250637195337757,
+      "learning_rate": 0.00019672545962930215,
+      "loss": 0.8652,
+      "step": 137
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.42335216858701397,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8126,
+      "step": 138
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.4106919019198593,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.7839,
+      "step": 139
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4505309788164157,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8319,
+      "step": 140
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.47701267935701247,
+      "learning_rate": 0.00019645710967265882,
+      "loss": 0.8866,
+      "step": 141
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4602105353793328,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.9068,
+      "step": 142
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.4211006037566543,
+      "learning_rate": 0.00019631904440143612,
+      "loss": 0.7682,
+      "step": 143
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4729031861679495,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8815,
+      "step": 144
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.5157834892568381,
+      "learning_rate": 0.00019617839052578603,
+      "loss": 0.7583,
+      "step": 145
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.389917286301472,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7356,
+      "step": 146
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.8020131752444241,
+      "learning_rate": 0.0001960351518258255,
+      "loss": 0.8031,
+      "step": 147
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.41886490199603604,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.7928,
+      "step": 148
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.4562959619157432,
+      "learning_rate": 0.00019588933215113926,
+      "loss": 0.7921,
+      "step": 149
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.43888692499935916,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8582,
+      "step": 150
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.46833199056820335,
+      "learning_rate": 0.00019574093542067673,
+      "loss": 0.7979,
+      "step": 151
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.44478689994491294,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.8095,
+      "step": 152
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.44454986358483156,
+      "learning_rate": 0.0001955899656226464,
+      "loss": 0.8201,
+      "step": 153
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.40740469459807094,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7693,
+      "step": 154
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.47108802996625854,
+      "learning_rate": 0.0001954364268144088,
+      "loss": 0.8201,
+      "step": 155
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.3936299625831791,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7713,
+      "step": 156
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.4697125406176289,
+      "learning_rate": 0.00019528032312236736,
+      "loss": 0.843,
+      "step": 157
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.40928772702393124,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.796,
+      "step": 158
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.3734963193919449,
+      "learning_rate": 0.00019512165874185767,
+      "loss": 0.8126,
+      "step": 159
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.40137246222831985,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7547,
+      "step": 160
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.42022270376452675,
+      "learning_rate": 0.0001949604379370345,
+      "loss": 0.7602,
+      "step": 161
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4681716232178868,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.8285,
+      "step": 162
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.4652249797090735,
+      "learning_rate": 0.00019479666504075736,
+      "loss": 0.8439,
+      "step": 163
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.41682731645078525,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7696,
+      "step": 164
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.44600530812927036,
+      "learning_rate": 0.0001946303444544741,
+      "loss": 0.8058,
+      "step": 165
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.39969615388564694,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.764,
+      "step": 166
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.39813415475940356,
+      "learning_rate": 0.00019446148064810242,
+      "loss": 0.7702,
+      "step": 167
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4760556291734454,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8239,
+      "step": 168
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.44706755771174195,
+      "learning_rate": 0.00019429007815990993,
+      "loss": 0.8155,
+      "step": 169
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4502000486187465,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8252,
+      "step": 170
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.44058029653007835,
+      "learning_rate": 0.00019411614159639204,
+      "loss": 0.7742,
+      "step": 171
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.4596690566770355,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7982,
+      "step": 172
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.4556227036052518,
+      "learning_rate": 0.00019393967563214833,
+      "loss": 0.7844,
+      "step": 173
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.40663953391441715,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.7605,
+      "step": 174
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4823398406663074,
+      "learning_rate": 0.00019376068500975667,
+      "loss": 0.8604,
+      "step": 175
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4434434432179949,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8015,
+      "step": 176
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.4493958616535301,
+      "learning_rate": 0.000193579174539646,
+      "loss": 0.85,
+      "step": 177
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4803966578590346,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.8563,
+      "step": 178
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.43226349981508544,
+      "learning_rate": 0.00019339514909996706,
+      "loss": 0.8131,
+      "step": 179
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.47305039724417586,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.8738,
+      "step": 180
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.43002078414180595,
+      "learning_rate": 0.00019320861363646095,
+      "loss": 0.7965,
+      "step": 181
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.4205158721935768,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7707,
+      "step": 182
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.410085310592379,
+      "learning_rate": 0.00019301957316232658,
+      "loss": 0.7176,
+      "step": 183
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.37748157807556615,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.761,
+      "step": 184
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.42171736108851915,
+      "learning_rate": 0.0001928280327580858,
+      "loss": 0.8526,
+      "step": 185
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.42954664943054666,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8467,
+      "step": 186
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.42428764414649567,
+      "learning_rate": 0.00019263399757144683,
+      "loss": 0.8196,
+      "step": 187
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.47698076811249246,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8608,
+      "step": 188
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.4640375687751282,
+      "learning_rate": 0.000192437472817166,
+      "loss": 0.8035,
+      "step": 189
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4463228546319291,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.7651,
+      "step": 190
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.4896046005460382,
+      "learning_rate": 0.00019223846377690754,
+      "loss": 0.7356,
+      "step": 191
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.46765869984639236,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.8544,
+      "step": 192
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.4764711702893882,
+      "learning_rate": 0.00019203697579910154,
+      "loss": 0.8524,
+      "step": 193
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.42707169579189624,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7988,
+      "step": 194
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.4212467340691912,
+      "learning_rate": 0.00019183301429880043,
+      "loss": 0.7765,
+      "step": 195
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.45757237942238316,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.8578,
+      "step": 196
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.4094434644819681,
+      "learning_rate": 0.00019162658475753327,
+      "loss": 0.7893,
+      "step": 197
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.40223255624486415,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.7402,
+      "step": 198
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.4622108299481829,
+      "learning_rate": 0.00019141769272315858,
+      "loss": 0.7591,
+      "step": 199
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3964607159201426,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.707,
+      "step": 200
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.4270168927998998,
+      "learning_rate": 0.00019120634380971496,
+      "loss": 0.8036,
+      "step": 201
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.3920550741841073,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.7356,
+      "step": 202
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.452142428420456,
+      "learning_rate": 0.0001909925436972706,
+      "loss": 0.7425,
+      "step": 203
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.44341855235588906,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.8787,
+      "step": 204
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.4103030809347348,
+      "learning_rate": 0.00019077629813177036,
+      "loss": 0.7727,
+      "step": 205
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.40090861914740916,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7651,
+      "step": 206
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.3951992588577195,
+      "learning_rate": 0.00019055761292488142,
+      "loss": 0.7535,
+      "step": 207
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.5164300583980236,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8772,
+      "step": 208
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.40493635384459464,
+      "learning_rate": 0.00019033649395383702,
+      "loss": 0.7572,
+      "step": 209
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4996389939569627,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7533,
+      "step": 210
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.41091072724636457,
+      "learning_rate": 0.00019011294716127867,
+      "loss": 0.75,
+      "step": 211
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.42158327729701495,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7546,
+      "step": 212
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.4631383500453912,
+      "learning_rate": 0.0001898869785550963,
+      "loss": 0.8005,
+      "step": 213
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.39620595629829286,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7487,
+      "step": 214
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.42423850878663344,
+      "learning_rate": 0.00018965859420826684,
+      "loss": 0.8103,
+      "step": 215
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4262639407196757,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.6982,
+      "step": 216
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.45313481608455014,
+      "learning_rate": 0.00018942780025869098,
+      "loss": 0.8694,
+      "step": 217
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.4213851398375332,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7663,
+      "step": 218
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.4796064199007877,
+      "learning_rate": 0.00018919460290902826,
+      "loss": 0.8103,
+      "step": 219
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.40359670606514453,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7737,
+      "step": 220
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.42517228222089504,
+      "learning_rate": 0.0001889590084265304,
+      "loss": 0.8458,
+      "step": 221
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.39853364630036775,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8487,
+      "step": 222
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.42701320403105236,
+      "learning_rate": 0.0001887210231428727,
+      "loss": 0.7836,
+      "step": 223
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4751710203159611,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8199,
+      "step": 224
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.4177959997718054,
+      "learning_rate": 0.0001884806534539841,
+      "loss": 0.7856,
+      "step": 225
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.47126589535950586,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8601,
+      "step": 226
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.40956026611263685,
+      "learning_rate": 0.0001882379058198751,
+      "loss": 0.7722,
+      "step": 227
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.3786410889782343,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7358,
+      "step": 228
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.42441607724963093,
+      "learning_rate": 0.00018799278676446423,
+      "loss": 0.8128,
+      "step": 229
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.3931184022834064,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7754,
+      "step": 230
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.4676541594064247,
+      "learning_rate": 0.00018774530287540278,
+      "loss": 0.8533,
+      "step": 231
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.41772326185138137,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7974,
+      "step": 232
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.42568144869884195,
+      "learning_rate": 0.00018749546080389757,
+      "loss": 0.7934,
+      "step": 233
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.41667139076366067,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7462,
+      "step": 234
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.3894407998600845,
+      "learning_rate": 0.00018724326726453244,
+      "loss": 0.7018,
+      "step": 235
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3968628191885499,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7229,
+      "step": 236
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.4311578529579322,
+      "learning_rate": 0.00018698872903508755,
+      "loss": 0.7536,
+      "step": 237
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4454357510677708,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7686,
+      "step": 238
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.4397625012035176,
+      "learning_rate": 0.0001867318529563574,
+      "loss": 0.8069,
+      "step": 239
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4175749180024889,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7943,
+      "step": 240
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.48026801501164645,
+      "learning_rate": 0.00018647264593196688,
+      "loss": 0.8252,
+      "step": 241
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.43837906517453124,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8318,
+      "step": 242
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.40552130965049665,
+      "learning_rate": 0.00018621111492818585,
+      "loss": 0.7579,
+      "step": 243
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.41965573044241067,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7377,
+      "step": 244
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.41379905946624335,
+      "learning_rate": 0.00018594726697374175,
+      "loss": 0.7916,
+      "step": 245
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.39928721320339905,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7484,
+      "step": 246
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.39695857145017316,
+      "learning_rate": 0.0001856811091596308,
+      "loss": 0.8062,
+      "step": 247
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4257439687221929,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8251,
+      "step": 248
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.3935361187026131,
+      "learning_rate": 0.00018541264863892754,
+      "loss": 0.7724,
+      "step": 249
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.37834209183020484,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7611,
+      "step": 250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.40419910202390763,
+      "learning_rate": 0.00018514189262659235,
+      "loss": 0.7845,
+      "step": 251
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4653326295828301,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8503,
+      "step": 252
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.43315318696524024,
+      "learning_rate": 0.00018486884839927768,
+      "loss": 0.8144,
+      "step": 253
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.4417707396554827,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.8415,
+      "step": 254
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.4248813489576955,
+      "learning_rate": 0.0001845935232951325,
+      "loss": 0.8443,
+      "step": 255
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4033508599201362,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7995,
+      "step": 256
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.4317865167024778,
+      "learning_rate": 0.00018431592471360503,
+      "loss": 0.7791,
+      "step": 257
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.40264945344040487,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.8031,
+      "step": 258
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.35295010248954845,
+      "learning_rate": 0.000184036060115244,
+      "loss": 0.7309,
+      "step": 259
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4605989535515705,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.865,
+      "step": 260
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.41132925305495854,
+      "learning_rate": 0.00018375393702149787,
+      "loss": 0.7421,
+      "step": 261
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.4567431675996166,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.8196,
+      "step": 262
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.4544080429648433,
+      "learning_rate": 0.00018346956301451304,
+      "loss": 0.8747,
+      "step": 263
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.43602304381113133,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8268,
+      "step": 264
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.3680958867209954,
+      "learning_rate": 0.00018318294573692985,
+      "loss": 0.7274,
+      "step": 265
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.41981614126886263,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7739,
+      "step": 266
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.45534292093001205,
+      "learning_rate": 0.0001828940928916772,
+      "loss": 0.843,
+      "step": 267
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4176930406124194,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.76,
+      "step": 268
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.4250593407694702,
+      "learning_rate": 0.00018260301224176558,
+      "loss": 0.8059,
+      "step": 269
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.385015379342669,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7183,
+      "step": 270
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.4049753323831426,
+      "learning_rate": 0.00018230971161007853,
+      "loss": 0.7731,
+      "step": 271
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4291693562623502,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.8287,
+      "step": 272
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.42908607036423363,
+      "learning_rate": 0.00018201419887916214,
+      "loss": 0.804,
+      "step": 273
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.41143273147413484,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7958,
+      "step": 274
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.4219014010666351,
+      "learning_rate": 0.00018171648199101346,
+      "loss": 0.7564,
+      "step": 275
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.40528270916888237,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.8199,
+      "step": 276
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.44757517887666887,
+      "learning_rate": 0.00018141656894686689,
+      "loss": 0.7724,
+      "step": 277
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.42027873904868446,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8265,
+      "step": 278
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.4148304854268926,
+      "learning_rate": 0.00018111446780697929,
+      "loss": 0.8756,
+      "step": 279
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.38377520871651966,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7468,
+      "step": 280
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.4264543015588282,
+      "learning_rate": 0.00018081018669041324,
+      "loss": 0.8039,
+      "step": 281
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.4136300149086464,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8031,
+      "step": 282
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.36648543430232305,
+      "learning_rate": 0.00018050373377481878,
+      "loss": 0.6646,
+      "step": 283
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4522172811815767,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.8701,
+      "step": 284
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.43740644922232075,
+      "learning_rate": 0.0001801951172962139,
+      "loss": 0.8038,
+      "step": 285
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.4174595987164087,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7608,
+      "step": 286
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.393744945703445,
+      "learning_rate": 0.0001798843455487629,
+      "loss": 0.7008,
+      "step": 287
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4894690003563711,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.8372,
+      "step": 288
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.4217244008041063,
+      "learning_rate": 0.00017957142688455362,
+      "loss": 0.823,
+      "step": 289
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.40260737765643384,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.76,
+      "step": 290
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.40243360935749045,
+      "learning_rate": 0.00017925636971337304,
+      "loss": 0.801,
+      "step": 291
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4348442831307786,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.8471,
+      "step": 292
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.43757909427752245,
+      "learning_rate": 0.00017893918250248104,
+      "loss": 0.8046,
+      "step": 293
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.4280554526661448,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7937,
+      "step": 294
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.3803413146307526,
+      "learning_rate": 0.00017861987377638312,
+      "loss": 0.7347,
+      "step": 295
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4256064404247689,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7465,
+      "step": 296
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.3954708369493568,
+      "learning_rate": 0.0001782984521166011,
+      "loss": 0.7141,
+      "step": 297
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.34956731911658284,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.6944,
+      "step": 298
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.4281109069360621,
+      "learning_rate": 0.00017797492616144256,
+      "loss": 0.7542,
+      "step": 299
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.38735441498537176,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.6906,
+      "step": 300
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.4804515522714117,
+      "learning_rate": 0.00017764930460576866,
+      "loss": 0.8035,
+      "step": 301
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.438669543093096,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.8029,
+      "step": 302
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.42012676468526255,
+      "learning_rate": 0.00017732159620076053,
+      "loss": 0.8088,
+      "step": 303
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5101640884332544,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.8541,
+      "step": 304
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.3703113726642919,
+      "learning_rate": 0.00017699180975368396,
+      "loss": 0.7049,
+      "step": 305
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.46783269643660297,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.8282,
+      "step": 306
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.38274836146236746,
+      "learning_rate": 0.00017665995412765285,
+      "loss": 0.7117,
+      "step": 307
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.44467611676383684,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7633,
+      "step": 308
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.39640522112939164,
+      "learning_rate": 0.00017632603824139085,
+      "loss": 0.7628,
+      "step": 309
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.4608950588902465,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7968,
+      "step": 310
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.40004221944472373,
+      "learning_rate": 0.0001759900710689918,
+      "loss": 0.7359,
+      "step": 311
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4342650667250139,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8056,
+      "step": 312
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.40171348460939127,
+      "learning_rate": 0.00017565206163967846,
+      "loss": 0.7035,
+      "step": 313
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.36209279422253543,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.6508,
+      "step": 314
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.4034756327472997,
+      "learning_rate": 0.00017531201903755994,
+      "loss": 0.7368,
+      "step": 315
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.39249218081928117,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7819,
+      "step": 316
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.42471256142276875,
+      "learning_rate": 0.00017496995240138744,
+      "loss": 0.7308,
+      "step": 317
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.4180759336676178,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.7141,
+      "step": 318
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.46855118885752606,
+      "learning_rate": 0.00017462587092430875,
+      "loss": 0.7186,
+      "step": 319
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4337432879128543,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7704,
+      "step": 320
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.4439354117273041,
+      "learning_rate": 0.00017427978385362112,
+      "loss": 0.832,
+      "step": 321
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4074274598145603,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7478,
+      "step": 322
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.38959870273386044,
+      "learning_rate": 0.0001739317004905227,
+      "loss": 0.6937,
+      "step": 323
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4582004248722656,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.8008,
+      "step": 324
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.3956315002460586,
+      "learning_rate": 0.00017358163018986282,
+      "loss": 0.7315,
+      "step": 325
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4262268133448119,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.819,
+      "step": 326
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.391879323864864,
+      "learning_rate": 0.00017322958235989016,
+      "loss": 0.7493,
+      "step": 327
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4274038802228928,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7617,
+      "step": 328
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.4290256337161772,
+      "learning_rate": 0.00017287556646200018,
+      "loss": 0.7613,
+      "step": 329
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.37952638899241375,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.6778,
+      "step": 330
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.3826196285241614,
+      "learning_rate": 0.00017251959201048083,
+      "loss": 0.6832,
+      "step": 331
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4050950810614008,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7675,
+      "step": 332
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.37958456133326496,
+      "learning_rate": 0.00017216166857225674,
+      "loss": 0.6923,
+      "step": 333
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.48525278577734343,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8732,
+      "step": 334
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.4350053478588795,
+      "learning_rate": 0.00017180180576663228,
+      "loss": 0.8142,
+      "step": 335
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.44116393153626615,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8282,
+      "step": 336
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.4292115231398351,
+      "learning_rate": 0.00017144001326503273,
+      "loss": 0.7402,
+      "step": 337
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.40532916649912354,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7299,
+      "step": 338
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.41032432819557796,
+      "learning_rate": 0.00017107630079074478,
+      "loss": 0.7953,
+      "step": 339
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.44426998765207043,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.8494,
+      "step": 340
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.47851547077634976,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.7823,
+      "step": 341
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.3798157359553566,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7411,
+      "step": 342
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.4116387079907209,
+      "learning_rate": 0.00017034315507498635,
+      "loss": 0.7693,
+      "step": 343
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4364919120832264,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7789,
+      "step": 344
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.41713361743005883,
+      "learning_rate": 0.00016997374153703625,
+      "loss": 0.7379,
+      "step": 345
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4206448578876803,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7692,
+      "step": 346
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.4592632053929459,
+      "learning_rate": 0.00016960244743290868,
+      "loss": 0.82,
+      "step": 347
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4523816657682658,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7709,
+      "step": 348
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.38625339507517903,
+      "learning_rate": 0.00016922928274124886,
+      "loss": 0.7234,
+      "step": 349
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.38691327725973956,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.6854,
+      "step": 350
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.5078844842218331,
+      "learning_rate": 0.00016885425749097444,
+      "loss": 0.8543,
+      "step": 351
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4385294481594214,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.8188,
+      "step": 352
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.3673088434620512,
+      "learning_rate": 0.00016847738176100632,
+      "loss": 0.6995,
+      "step": 353
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.3746665041857716,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7012,
+      "step": 354
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.3784264474612318,
+      "learning_rate": 0.0001680986656799975,
+      "loss": 0.6636,
+      "step": 355
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.42920592262138246,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.8046,
+      "step": 356
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.38639789811582664,
+      "learning_rate": 0.00016771811942606108,
+      "loss": 0.7038,
+      "step": 357
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.453939261311069,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8008,
+      "step": 358
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.5125707908362529,
+      "learning_rate": 0.00016733575322649657,
+      "loss": 0.833,
+      "step": 359
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4053712383468021,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.8229,
+      "step": 360
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.4273953244431224,
+      "learning_rate": 0.00016695157735751513,
+      "loss": 0.792,
+      "step": 361
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.40758364381427453,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7753,
+      "step": 362
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.46744613888999914,
+      "learning_rate": 0.0001665656021439633,
+      "loss": 0.8121,
+      "step": 363
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.42243568027474676,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7225,
+      "step": 364
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.42334249817039404,
+      "learning_rate": 0.00016617783795904565,
+      "loss": 0.7629,
+      "step": 365
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.41496346524495037,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.6595,
+      "step": 366
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.4041449703070248,
+      "learning_rate": 0.00016578829522404583,
+      "loss": 0.76,
+      "step": 367
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4602823503849498,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7463,
+      "step": 368
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.45829102214665335,
+      "learning_rate": 0.00016539698440804661,
+      "loss": 0.807,
+      "step": 369
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5194444205744141,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.8964,
+      "step": 370
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.4274362092511141,
+      "learning_rate": 0.0001650039160276485,
+      "loss": 0.7341,
+      "step": 371
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.48382529105481803,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.8081,
+      "step": 372
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.43814380428013716,
+      "learning_rate": 0.0001646091006466871,
+      "loss": 0.7683,
+      "step": 373
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.45029893258300174,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.796,
+      "step": 374
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.4400905848459202,
+      "learning_rate": 0.00016421254887594917,
+      "loss": 0.7439,
+      "step": 375
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.46507271493682933,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7493,
+      "step": 376
+    },
+    {
+      "epoch": 0.3016,
+      "grad_norm": 0.470863177488771,
+      "learning_rate": 0.00016381427137288754,
+      "loss": 0.7696,
+      "step": 377
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.4232176350661182,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7332,
+      "step": 378
+    },
+    {
+      "epoch": 0.3032,
+      "grad_norm": 0.44400709128201865,
+      "learning_rate": 0.0001634142788413346,
+      "loss": 0.8178,
+      "step": 379
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.41675859512276436,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7634,
+      "step": 380
+    },
+    {
+      "epoch": 0.3048,
+      "grad_norm": 0.3989561799033566,
+      "learning_rate": 0.00016301258203121462,
+      "loss": 0.7214,
+      "step": 381
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.47417378039265345,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.8712,
+      "step": 382
+    },
+    {
+      "epoch": 0.3064,
+      "grad_norm": 0.3907988356051514,
+      "learning_rate": 0.00016260919173825508,
+      "loss": 0.7742,
+      "step": 383
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.40727263958681575,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7074,
+      "step": 384
+    },
+    {
+      "epoch": 0.308,
+      "grad_norm": 0.4120998531328911,
+      "learning_rate": 0.00016220411880369601,
+      "loss": 0.794,
+      "step": 385
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.3994194092341835,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7515,
+      "step": 386
+    },
+    {
+      "epoch": 0.3096,
+      "grad_norm": 0.4790153356094912,
+      "learning_rate": 0.00016179737411399926,
+      "loss": 0.8656,
+      "step": 387
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.40077998577507146,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7278,
+      "step": 388
+    },
+    {
+      "epoch": 0.3112,
+      "grad_norm": 0.38512668766662994,
+      "learning_rate": 0.00016138896860055555,
+      "loss": 0.7802,
+      "step": 389
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.4011914166317225,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7842,
+      "step": 390
+    },
+    {
+      "epoch": 0.3128,
+      "grad_norm": 0.43022389714437426,
+      "learning_rate": 0.00016097891323939062,
+      "loss": 0.7656,
+      "step": 391
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.39144444511121446,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.744,
+      "step": 392
+    },
+    {
+      "epoch": 0.3144,
+      "grad_norm": 0.3694560310117428,
+      "learning_rate": 0.00016056721905087056,
+      "loss": 0.7312,
+      "step": 393
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.4234341894045398,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.8192,
+      "step": 394
+    },
+    {
+      "epoch": 0.316,
+      "grad_norm": 0.43378554475513786,
+      "learning_rate": 0.00016015389709940538,
+      "loss": 0.7725,
+      "step": 395
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3926939490195302,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7287,
+      "step": 396
+    },
+    {
+      "epoch": 0.3176,
+      "grad_norm": 0.4353029147339415,
+      "learning_rate": 0.0001597389584931517,
+      "loss": 0.7806,
+      "step": 397
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.4104470890785654,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7792,
+      "step": 398
+    },
+    {
+      "epoch": 0.3192,
+      "grad_norm": 0.4276700358980658,
+      "learning_rate": 0.0001593224143837142,
+      "loss": 0.7555,
+      "step": 399
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4938143578839374,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.8334,
+      "step": 400
+    },
+    {
+      "epoch": 0.3208,
+      "grad_norm": 0.42818147606374335,
+      "learning_rate": 0.00015890427596584617,
+      "loss": 0.8428,
+      "step": 401
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.41406947240352393,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.796,
+      "step": 402
+    },
+    {
+      "epoch": 0.3224,
+      "grad_norm": 0.44281639260862904,
+      "learning_rate": 0.00015848455447714822,
+      "loss": 0.7398,
+      "step": 403
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4158875186808861,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7935,
+      "step": 404
+    },
+    {
+      "epoch": 0.324,
+      "grad_norm": 0.518693050154203,
+      "learning_rate": 0.00015806326119776663,
+      "loss": 0.8845,
+      "step": 405
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.43617889754344386,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.738,
+      "step": 406
+    },
+    {
+      "epoch": 0.3256,
+      "grad_norm": 0.41393504365642786,
+      "learning_rate": 0.00015764040745008988,
+      "loss": 0.7662,
+      "step": 407
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3916505340881447,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.7218,
+      "step": 408
+    },
+    {
+      "epoch": 0.3272,
+      "grad_norm": 0.40893671612613586,
+      "learning_rate": 0.00015721600459844468,
+      "loss": 0.7065,
+      "step": 409
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.42860148639099166,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7666,
+      "step": 410
+    },
+    {
+      "epoch": 0.3288,
+      "grad_norm": 0.3770999407298945,
+      "learning_rate": 0.00015679006404879033,
+      "loss": 0.7275,
+      "step": 411
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3773561415348383,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.725,
+      "step": 412
+    },
+    {
+      "epoch": 0.3304,
+      "grad_norm": 0.42739677259444697,
+      "learning_rate": 0.00015636259724841222,
+      "loss": 0.713,
+      "step": 413
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3746420515089487,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7082,
+      "step": 414
+    },
+    {
+      "epoch": 0.332,
+      "grad_norm": 0.45154432295964797,
+      "learning_rate": 0.00015593361568561428,
+      "loss": 0.8361,
+      "step": 415
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.4154008294171299,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7281,
+      "step": 416
+    },
+    {
+      "epoch": 0.3336,
+      "grad_norm": 0.41672996846450994,
+      "learning_rate": 0.0001555031308894101,
+      "loss": 0.7824,
+      "step": 417
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.4050763828119448,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.7457,
+      "step": 418
+    },
+    {
+      "epoch": 0.3352,
+      "grad_norm": 0.3578301373838656,
+      "learning_rate": 0.0001550711544292131,
+      "loss": 0.696,
+      "step": 419
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4112973121993917,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7459,
+      "step": 420
+    },
+    {
+      "epoch": 0.3368,
+      "grad_norm": 0.40075180843741315,
+      "learning_rate": 0.00015463769791452574,
+      "loss": 0.7239,
+      "step": 421
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.3843413424730113,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7797,
+      "step": 422
+    },
+    {
+      "epoch": 0.3384,
+      "grad_norm": 0.47035153771457855,
+      "learning_rate": 0.00015420277299462736,
+      "loss": 0.8651,
+      "step": 423
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4043920327079687,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.759,
+      "step": 424
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.4099403010516249,
+      "learning_rate": 0.00015376639135826107,
+      "loss": 0.7971,
+      "step": 425
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4209734147382475,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7346,
+      "step": 426
+    },
+    {
+      "epoch": 0.3416,
+      "grad_norm": 0.3637299336475154,
+      "learning_rate": 0.00015332856473331978,
+      "loss": 0.655,
+      "step": 427
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3917734710599,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7769,
+      "step": 428
+    },
+    {
+      "epoch": 0.3432,
+      "grad_norm": 0.4185800922042766,
+      "learning_rate": 0.00015288930488653094,
+      "loss": 0.8091,
+      "step": 429
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.4072904979262855,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7374,
+      "step": 430
+    },
+    {
+      "epoch": 0.3448,
+      "grad_norm": 0.4051590420213116,
+      "learning_rate": 0.0001524486236231402,
+      "loss": 0.8019,
+      "step": 431
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3456562009787237,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.6344,
+      "step": 432
+    },
+    {
+      "epoch": 0.3464,
+      "grad_norm": 0.45860809030378363,
+      "learning_rate": 0.00015200653278659432,
+      "loss": 0.7723,
+      "step": 433
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4233875124766856,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7464,
+      "step": 434
+    },
+    {
+      "epoch": 0.348,
+      "grad_norm": 0.46047697958856054,
+      "learning_rate": 0.00015156304425822267,
+      "loss": 0.786,
+      "step": 435
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4482851013634399,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7635,
+      "step": 436
+    },
+    {
+      "epoch": 0.3496,
+      "grad_norm": 0.41374354113559086,
+      "learning_rate": 0.00015111816995691809,
+      "loss": 0.7731,
+      "step": 437
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.5537275439205439,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7183,
+      "step": 438
+    },
+    {
+      "epoch": 0.3512,
+      "grad_norm": 0.4090471845722831,
+      "learning_rate": 0.00015067192183881658,
+      "loss": 0.7399,
+      "step": 439
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4121289703575594,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7735,
+      "step": 440
+    },
+    {
+      "epoch": 0.3528,
+      "grad_norm": 0.5004477427745413,
+      "learning_rate": 0.00015022431189697568,
+      "loss": 0.8698,
+      "step": 441
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.44573437201228233,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.686,
+      "step": 442
+    },
+    {
+      "epoch": 0.3544,
+      "grad_norm": 0.4104756936070795,
+      "learning_rate": 0.0001497753521610526,
+      "loss": 0.7473,
+      "step": 443
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3481804037211295,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.6573,
+      "step": 444
+    },
+    {
+      "epoch": 0.356,
+      "grad_norm": 0.39354978860410955,
+      "learning_rate": 0.00014932505469698052,
+      "loss": 0.7049,
+      "step": 445
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.38871727928399735,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.6936,
+      "step": 446
+    },
+    {
+      "epoch": 0.3576,
+      "grad_norm": 0.3971615200108154,
+      "learning_rate": 0.0001488734316066446,
+      "loss": 0.629,
+      "step": 447
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4026758153103403,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7671,
+      "step": 448
+    },
+    {
+      "epoch": 0.3592,
+      "grad_norm": 0.42621146792279624,
+      "learning_rate": 0.0001484204950275565,
+      "loss": 0.7886,
+      "step": 449
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.40719481023108833,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7683,
+      "step": 450
+    },
+    {
+      "epoch": 0.3608,
+      "grad_norm": 0.4505840013211492,
+      "learning_rate": 0.00014796625713252848,
+      "loss": 0.7966,
+      "step": 451
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.41545457724746276,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7301,
+      "step": 452
+    },
+    {
+      "epoch": 0.3624,
+      "grad_norm": 0.3645602116722643,
+      "learning_rate": 0.00014751073012934587,
+      "loss": 0.6745,
+      "step": 453
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4137319078060829,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7097,
+      "step": 454
+    },
+    {
+      "epoch": 0.364,
+      "grad_norm": 0.41434719090532385,
+      "learning_rate": 0.0001470539262604393,
+      "loss": 0.7283,
+      "step": 455
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4189836301797749,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.7514,
+      "step": 456
+    },
+    {
+      "epoch": 0.3656,
+      "grad_norm": 0.4446067879616907,
+      "learning_rate": 0.00014659585780255556,
+      "loss": 0.7609,
+      "step": 457
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.3855150992819854,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7222,
+      "step": 458
+    },
+    {
+      "epoch": 0.3672,
+      "grad_norm": 0.4186352163434588,
+      "learning_rate": 0.0001461365370664276,
+      "loss": 0.7024,
+      "step": 459
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.43010146235663815,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.8027,
+      "step": 460
+    },
+    {
+      "epoch": 0.3688,
+      "grad_norm": 0.3920652110717218,
+      "learning_rate": 0.00014567597639644387,
+      "loss": 0.6709,
+      "step": 461
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.3764836291055801,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.6964,
+      "step": 462
+    },
+    {
+      "epoch": 0.3704,
+      "grad_norm": 0.4054858421959242,
+      "learning_rate": 0.00014521418817031628,
+      "loss": 0.8018,
+      "step": 463
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.37245905228447623,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.6737,
+      "step": 464
+    },
+    {
+      "epoch": 0.372,
+      "grad_norm": 0.38370190144262273,
+      "learning_rate": 0.00014475118479874774,
+      "loss": 0.7373,
+      "step": 465
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.3709038780796342,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7489,
+      "step": 466
+    },
+    {
+      "epoch": 0.3736,
+      "grad_norm": 0.43629126754331155,
+      "learning_rate": 0.0001442869787250987,
+      "loss": 0.7166,
+      "step": 467
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.46491387794401623,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8376,
+      "step": 468
+    },
+    {
+      "epoch": 0.3752,
+      "grad_norm": 0.466085825819284,
+      "learning_rate": 0.00014382158242505234,
+      "loss": 0.783,
+      "step": 469
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.44202181643730803,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7547,
+      "step": 470
+    },
+    {
+      "epoch": 0.3768,
+      "grad_norm": 0.4072741860029995,
+      "learning_rate": 0.00014335500840627986,
+      "loss": 0.7164,
+      "step": 471
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4275506204938012,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7707,
+      "step": 472
+    },
+    {
+      "epoch": 0.3784,
+      "grad_norm": 0.3857298764352471,
+      "learning_rate": 0.0001428872692081038,
+      "loss": 0.7133,
+      "step": 473
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.37400963436434526,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.656,
+      "step": 474
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.40789460594727933,
+      "learning_rate": 0.00014241837740116132,
+      "loss": 0.7493,
+      "step": 475
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.43917867571684516,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7404,
+      "step": 476
+    },
+    {
+      "epoch": 0.3816,
+      "grad_norm": 0.38770506133101995,
+      "learning_rate": 0.00014194834558706632,
+      "loss": 0.6963,
+      "step": 477
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.4605066053260177,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7401,
+      "step": 478
+    },
+    {
+      "epoch": 0.3832,
+      "grad_norm": 0.3623723615249701,
+      "learning_rate": 0.0001414771863980707,
+      "loss": 0.6946,
+      "step": 479
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.472597365619049,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.849,
+      "step": 480
+    },
+    {
+      "epoch": 0.3848,
+      "grad_norm": 0.38059398536400085,
+      "learning_rate": 0.00014100491249672498,
+      "loss": 0.6829,
+      "step": 481
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.4419328471189818,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7706,
+      "step": 482
+    },
+    {
+      "epoch": 0.3864,
+      "grad_norm": 0.4174488873084172,
+      "learning_rate": 0.0001405315365755379,
+      "loss": 0.7806,
+      "step": 483
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.37995649754107935,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7073,
+      "step": 484
+    },
+    {
+      "epoch": 0.388,
+      "grad_norm": 0.37905339289160617,
+      "learning_rate": 0.00014005707135663527,
+      "loss": 0.7059,
+      "step": 485
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.39285244383934415,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7278,
+      "step": 486
+    },
+    {
+      "epoch": 0.3896,
+      "grad_norm": 0.40403149537453,
+      "learning_rate": 0.00013958152959141825,
+      "loss": 0.7724,
+      "step": 487
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.45547737889839607,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7227,
+      "step": 488
+    },
+    {
+      "epoch": 0.3912,
+      "grad_norm": 0.39041841421012613,
+      "learning_rate": 0.00013910492406022033,
+      "loss": 0.7022,
+      "step": 489
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4409379938077856,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7887,
+      "step": 490
+    },
+    {
+      "epoch": 0.3928,
+      "grad_norm": 0.40077109270195066,
+      "learning_rate": 0.0001386272675719642,
+      "loss": 0.7368,
+      "step": 491
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3868223095035165,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.6246,
+      "step": 492
+    },
+    {
+      "epoch": 0.3944,
+      "grad_norm": 0.4553742997236335,
+      "learning_rate": 0.00013814857296381728,
+      "loss": 0.7501,
+      "step": 493
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.4312707891511658,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7813,
+      "step": 494
+    },
+    {
+      "epoch": 0.396,
+      "grad_norm": 0.3825011866495948,
+      "learning_rate": 0.00013766885310084688,
+      "loss": 0.7621,
+      "step": 495
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4005274079834418,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7327,
+      "step": 496
+    },
+    {
+      "epoch": 0.3976,
+      "grad_norm": 0.47541170393500026,
+      "learning_rate": 0.00013718812087567414,
+      "loss": 0.8572,
+      "step": 497
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.39858455064952547,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7918,
+      "step": 498
+    },
+    {
+      "epoch": 0.3992,
+      "grad_norm": 0.35876780138681247,
+      "learning_rate": 0.000136706389208128,
+      "loss": 0.6632,
+      "step": 499
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4132380480989518,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.769,
+      "step": 500
+    },
+    {
+      "epoch": 0.4008,
+      "grad_norm": 0.41498450091839245,
+      "learning_rate": 0.00013622367104489756,
+      "loss": 0.7731,
+      "step": 501
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.41062942938318003,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7176,
+      "step": 502
+    },
+    {
+      "epoch": 0.4024,
+      "grad_norm": 0.4241205793650312,
+      "learning_rate": 0.0001357399793591844,
+      "loss": 0.8063,
+      "step": 503
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.45146142335068495,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7076,
+      "step": 504
+    },
+    {
+      "epoch": 0.404,
+      "grad_norm": 0.4255936702727934,
+      "learning_rate": 0.00013525532715035366,
+      "loss": 0.7368,
+      "step": 505
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.3725789711645719,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.6377,
+      "step": 506
+    },
+    {
+      "epoch": 0.4056,
+      "grad_norm": 0.4281771555109678,
+      "learning_rate": 0.00013476972744358507,
+      "loss": 0.7534,
+      "step": 507
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4166453240078789,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7519,
+      "step": 508
+    },
+    {
+      "epoch": 0.4072,
+      "grad_norm": 0.3935184546178131,
+      "learning_rate": 0.00013428319328952253,
+      "loss": 0.7095,
+      "step": 509
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4126976340784094,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7341,
+      "step": 510
+    },
+    {
+      "epoch": 0.4088,
+      "grad_norm": 0.3863881482536955,
+      "learning_rate": 0.0001337957377639235,
+      "loss": 0.7071,
+      "step": 511
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.40377691461157295,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7482,
+      "step": 512
+    },
+    {
+      "epoch": 0.4104,
+      "grad_norm": 0.4043391983877137,
+      "learning_rate": 0.0001333073739673076,
+      "loss": 0.7771,
+      "step": 513
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.4404424404024769,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.8005,
+      "step": 514
+    },
+    {
+      "epoch": 0.412,
+      "grad_norm": 0.40512715034400004,
+      "learning_rate": 0.0001328181150246045,
+      "loss": 0.7869,
+      "step": 515
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3794249661172897,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7199,
+      "step": 516
+    },
+    {
+      "epoch": 0.4136,
+      "grad_norm": 0.36643121228895176,
+      "learning_rate": 0.00013232797408480127,
+      "loss": 0.6775,
+      "step": 517
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.41580145081995074,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.786,
+      "step": 518
+    },
+    {
+      "epoch": 0.4152,
+      "grad_norm": 0.3629299406899404,
+      "learning_rate": 0.00013183696432058888,
+      "loss": 0.6818,
+      "step": 519
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4351438441221242,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.786,
+      "step": 520
+    },
+    {
+      "epoch": 0.4168,
+      "grad_norm": 0.4115001032600809,
+      "learning_rate": 0.00013134509892800822,
+      "loss": 0.779,
+      "step": 521
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.4333855714956754,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7593,
+      "step": 522
+    },
+    {
+      "epoch": 0.4184,
+      "grad_norm": 0.37380437241387643,
+      "learning_rate": 0.00013085239112609547,
+      "loss": 0.7469,
+      "step": 523
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.40366060949773175,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7328,
+      "step": 524
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.39096397503334457,
+      "learning_rate": 0.00013035885415652685,
+      "loss": 0.7299,
+      "step": 525
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4060822153997327,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7612,
+      "step": 526
+    },
+    {
+      "epoch": 0.4216,
+      "grad_norm": 0.33592868000345044,
+      "learning_rate": 0.00012986450128326266,
+      "loss": 0.6428,
+      "step": 527
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.37586206358484725,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7068,
+      "step": 528
+    },
+    {
+      "epoch": 0.4232,
+      "grad_norm": 0.36027202021666715,
+      "learning_rate": 0.00012936934579219094,
+      "loss": 0.6888,
+      "step": 529
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3805555609387189,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.7545,
+      "step": 530
+    },
+    {
+      "epoch": 0.4248,
+      "grad_norm": 0.3950942606832783,
+      "learning_rate": 0.00012887340099077024,
+      "loss": 0.7705,
+      "step": 531
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.33351189312432067,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.6195,
+      "step": 532
+    },
+    {
+      "epoch": 0.4264,
+      "grad_norm": 0.41876839923857423,
+      "learning_rate": 0.0001283766802076722,
+      "loss": 0.7071,
+      "step": 533
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.38472473714600236,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.6606,
+      "step": 534
+    },
+    {
+      "epoch": 0.428,
+      "grad_norm": 0.3843758442692758,
+      "learning_rate": 0.00012787919679242306,
+      "loss": 0.693,
+      "step": 535
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4283657231928018,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.7061,
+      "step": 536
+    },
+    {
+      "epoch": 0.4296,
+      "grad_norm": 0.42564365770391577,
+      "learning_rate": 0.00012738096411504522,
+      "loss": 0.7592,
+      "step": 537
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.4194372565938139,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7163,
+      "step": 538
+    },
+    {
+      "epoch": 0.4312,
+      "grad_norm": 0.3443994575649735,
+      "learning_rate": 0.00012688199556569753,
+      "loss": 0.6153,
+      "step": 539
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.39972130259342203,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7222,
+      "step": 540
+    },
+    {
+      "epoch": 0.4328,
+      "grad_norm": 0.38675070762078495,
+      "learning_rate": 0.0001263823045543158,
+      "loss": 0.6423,
+      "step": 541
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.41841132296140043,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.8016,
+      "step": 542
+    },
+    {
+      "epoch": 0.4344,
+      "grad_norm": 0.4167829979368189,
+      "learning_rate": 0.00012588190451025207,
+      "loss": 0.7683,
+      "step": 543
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3876578397333228,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7366,
+      "step": 544
+    },
+    {
+      "epoch": 0.436,
+      "grad_norm": 0.45360324255725454,
+      "learning_rate": 0.00012538080888191408,
+      "loss": 0.8194,
+      "step": 545
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.3877847545263509,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7216,
+      "step": 546
+    },
+    {
+      "epoch": 0.4376,
+      "grad_norm": 0.3843877463228844,
+      "learning_rate": 0.00012487903113640337,
+      "loss": 0.7648,
+      "step": 547
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3826732302875234,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7269,
+      "step": 548
+    },
+    {
+      "epoch": 0.4392,
+      "grad_norm": 0.4174993947124962,
+      "learning_rate": 0.00012437658475915377,
+      "loss": 0.6984,
+      "step": 549
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3956102928696449,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6817,
+      "step": 550
+    },
+    {
+      "epoch": 0.4408,
+      "grad_norm": 0.3855227847776443,
+      "learning_rate": 0.00012387348325356874,
+      "loss": 0.7379,
+      "step": 551
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3870115797122608,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6653,
+      "step": 552
+    },
+    {
+      "epoch": 0.4424,
+      "grad_norm": 0.4032790635846131,
+      "learning_rate": 0.00012336974014065844,
+      "loss": 0.7051,
+      "step": 553
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.40870803711492304,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6609,
+      "step": 554
+    },
+    {
+      "epoch": 0.444,
+      "grad_norm": 0.3836852385424524,
+      "learning_rate": 0.00012286536895867654,
+      "loss": 0.6716,
+      "step": 555
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.39027238939245784,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7238,
+      "step": 556
+    },
+    {
+      "epoch": 0.4456,
+      "grad_norm": 0.44435305582292883,
+      "learning_rate": 0.00012236038326275626,
+      "loss": 0.7842,
+      "step": 557
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.36807764876205074,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7188,
+      "step": 558
+    },
+    {
+      "epoch": 0.4472,
+      "grad_norm": 0.37974313056622594,
+      "learning_rate": 0.00012185479662454595,
+      "loss": 0.6753,
+      "step": 559
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.49191848982158215,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7274,
+      "step": 560
+    },
+    {
+      "epoch": 0.4488,
+      "grad_norm": 0.436705325672836,
+      "learning_rate": 0.00012134862263184467,
+      "loss": 0.805,
+      "step": 561
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.4558997685379279,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7534,
+      "step": 562
+    },
+    {
+      "epoch": 0.4504,
+      "grad_norm": 0.3614375682478202,
+      "learning_rate": 0.00012084187488823657,
+      "loss": 0.7331,
+      "step": 563
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.40962537424818934,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7567,
+      "step": 564
+    },
+    {
+      "epoch": 0.452,
+      "grad_norm": 0.3717354299114109,
+      "learning_rate": 0.00012033456701272576,
+      "loss": 0.685,
+      "step": 565
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.46433120094348523,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.8213,
+      "step": 566
+    },
+    {
+      "epoch": 0.4536,
+      "grad_norm": 0.3971642044552432,
+      "learning_rate": 0.00011982671263936995,
+      "loss": 0.7562,
+      "step": 567
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.36437109433661913,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.6947,
+      "step": 568
+    },
+    {
+      "epoch": 0.4552,
+      "grad_norm": 0.4094693318616697,
+      "learning_rate": 0.00011931832541691418,
+      "loss": 0.7809,
+      "step": 569
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.39973881274641615,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7591,
+      "step": 570
+    },
+    {
+      "epoch": 0.4568,
+      "grad_norm": 0.43600619875021857,
+      "learning_rate": 0.00011880941900842397,
+      "loss": 0.7489,
+      "step": 571
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.40584693339445527,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7284,
+      "step": 572
+    },
+    {
+      "epoch": 0.4584,
+      "grad_norm": 0.38949820467031054,
+      "learning_rate": 0.00011830000709091815,
+      "loss": 0.7687,
+      "step": 573
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.31714194772776355,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.5904,
+      "step": 574
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.4385282513881684,
+      "learning_rate": 0.0001177901033550012,
+      "loss": 0.7252,
+      "step": 575
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.39049755420507537,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.6847,
+      "step": 576
+    },
+    {
+      "epoch": 0.4616,
+      "grad_norm": 0.40999065646712857,
+      "learning_rate": 0.00011727972150449544,
+      "loss": 0.771,
+      "step": 577
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.3443179403111987,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.6518,
+      "step": 578
+    },
+    {
+      "epoch": 0.4632,
+      "grad_norm": 0.360005319545128,
+      "learning_rate": 0.00011676887525607271,
+      "loss": 0.6874,
+      "step": 579
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3660357434477541,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.6991,
+      "step": 580
+    },
+    {
+      "epoch": 0.4648,
+      "grad_norm": 0.3501226577092116,
+      "learning_rate": 0.00011625757833888551,
+      "loss": 0.6327,
+      "step": 581
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.34940015104960626,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.6964,
+      "step": 582
+    },
+    {
+      "epoch": 0.4664,
+      "grad_norm": 0.3959512088553381,
+      "learning_rate": 0.0001157458444941984,
+      "loss": 0.7498,
+      "step": 583
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.5000743050602681,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.8117,
+      "step": 584
+    },
+    {
+      "epoch": 0.468,
+      "grad_norm": 0.45493454847087106,
+      "learning_rate": 0.00011523368747501839,
+      "loss": 0.7654,
+      "step": 585
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4181817072560485,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7049,
+      "step": 586
+    },
+    {
+      "epoch": 0.4696,
+      "grad_norm": 0.4993112369023617,
+      "learning_rate": 0.00011472112104572547,
+      "loss": 0.8586,
+      "step": 587
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3688590762804925,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6746,
+      "step": 588
+    },
+    {
+      "epoch": 0.4712,
+      "grad_norm": 0.4035478132113849,
+      "learning_rate": 0.0001142081589817027,
+      "loss": 0.722,
+      "step": 589
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.5120096404712139,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.8725,
+      "step": 590
+    },
+    {
+      "epoch": 0.4728,
+      "grad_norm": 0.3943051369711696,
+      "learning_rate": 0.00011369481506896582,
+      "loss": 0.7279,
+      "step": 591
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.49952239523844755,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.806,
+      "step": 592
+    },
+    {
+      "epoch": 0.4744,
+      "grad_norm": 0.4111541198803906,
+      "learning_rate": 0.00011318110310379301,
+      "loss": 0.7445,
+      "step": 593
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.42379537377629345,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.7724,
+      "step": 594
+    },
+    {
+      "epoch": 0.476,
+      "grad_norm": 0.3876066415267279,
+      "learning_rate": 0.00011266703689235394,
+      "loss": 0.733,
+      "step": 595
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.38414077142798214,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.7195,
+      "step": 596
+    },
+    {
+      "epoch": 0.4776,
+      "grad_norm": 0.3961888164133105,
+      "learning_rate": 0.00011215263025033869,
+      "loss": 0.708,
+      "step": 597
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.38639229096606376,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7574,
+      "step": 598
+    },
+    {
+      "epoch": 0.4792,
+      "grad_norm": 0.486289906730662,
+      "learning_rate": 0.00011163789700258655,
+      "loss": 0.749,
+      "step": 599
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.35051217769492726,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.6801,
+      "step": 600
+    },
+    {
+      "epoch": 0.4808,
+      "grad_norm": 0.3992297765587727,
+      "learning_rate": 0.00011112285098271451,
+      "loss": 0.7253,
+      "step": 601
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.4191833397729842,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6575,
+      "step": 602
+    },
+    {
+      "epoch": 0.4824,
+      "grad_norm": 0.45621005745711735,
+      "learning_rate": 0.00011060750603274535,
+      "loss": 0.7948,
+      "step": 603
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.42468678684324007,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.7157,
+      "step": 604
+    },
+    {
+      "epoch": 0.484,
+      "grad_norm": 0.43463087625405183,
+      "learning_rate": 0.00011009187600273566,
+      "loss": 0.7718,
+      "step": 605
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.40796885202363853,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.6686,
+      "step": 606
+    },
+    {
+      "epoch": 0.4856,
+      "grad_norm": 0.42180210923844613,
+      "learning_rate": 0.00010957597475040373,
+      "loss": 0.7862,
+      "step": 607
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3306181956675294,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.6291,
+      "step": 608
+    },
+    {
+      "epoch": 0.4872,
+      "grad_norm": 0.3986594575930178,
+      "learning_rate": 0.00010905981614075693,
+      "loss": 0.7029,
+      "step": 609
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.37841030070189924,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.6921,
+      "step": 610
+    },
+    {
+      "epoch": 0.4888,
+      "grad_norm": 0.4456022887964803,
+      "learning_rate": 0.00010854341404571928,
+      "loss": 0.7539,
+      "step": 611
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.36332341374759114,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.7275,
+      "step": 612
+    },
+    {
+      "epoch": 0.4904,
+      "grad_norm": 0.4088416033183619,
+      "learning_rate": 0.00010802678234375851,
+      "loss": 0.7169,
+      "step": 613
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.4489539684761865,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6952,
+      "step": 614
+    },
+    {
+      "epoch": 0.492,
+      "grad_norm": 0.37750215592470654,
+      "learning_rate": 0.0001075099349195131,
+      "loss": 0.6821,
+      "step": 615
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.42414263824607873,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6966,
+      "step": 616
+    },
+    {
+      "epoch": 0.4936,
+      "grad_norm": 0.4627681021566136,
+      "learning_rate": 0.00010699288566341914,
+      "loss": 0.7437,
+      "step": 617
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.4296399308739044,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7833,
+      "step": 618
+    },
+    {
+      "epoch": 0.4952,
+      "grad_norm": 0.39315532016308347,
+      "learning_rate": 0.000106475648471337,
+      "loss": 0.6944,
+      "step": 619
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.32901112928582993,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.636,
+      "step": 620
+    },
+    {
+      "epoch": 0.4968,
+      "grad_norm": 0.44676630419360114,
+      "learning_rate": 0.00010595823724417795,
+      "loss": 0.8107,
+      "step": 621
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3193413326727492,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.5789,
+      "step": 622
+    },
+    {
+      "epoch": 0.4984,
+      "grad_norm": 0.4383082208833308,
+      "learning_rate": 0.00010544066588753044,
+      "loss": 0.7389,
+      "step": 623
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.37893839035584187,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.7232,
+      "step": 624
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.40385514455332266,
+      "learning_rate": 0.00010492294831128641,
+      "loss": 0.6872,
+      "step": 625
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4107409175696198,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7025,
+      "step": 626
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 0.45813590214281025,
+      "learning_rate": 0.00010440509842926767,
+      "loss": 0.7535,
+      "step": 627
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.48500097796997715,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7116,
+      "step": 628
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 0.4390511318468338,
+      "learning_rate": 0.00010388713015885161,
+      "loss": 0.7775,
+      "step": 629
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.5744965923745033,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.7349,
+      "step": 630
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 0.35685145254456246,
+      "learning_rate": 0.00010336905742059742,
+      "loss": 0.6812,
+      "step": 631
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4500760176534382,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7752,
+      "step": 632
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 0.43381025518263794,
+      "learning_rate": 0.0001028508941378719,
+      "loss": 0.7205,
+      "step": 633
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.3743091354152876,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7308,
+      "step": 634
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 0.35456208664101796,
+      "learning_rate": 0.00010233265423647523,
+      "loss": 0.7058,
+      "step": 635
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3777273041080645,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6929,
+      "step": 636
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 0.5627296053512927,
+      "learning_rate": 0.00010181435164426676,
+      "loss": 0.7176,
+      "step": 637
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3882411951123953,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6935,
+      "step": 638
+    },
+    {
+      "epoch": 0.5112,
+      "grad_norm": 0.4010106008367087,
+      "learning_rate": 0.00010129600029079072,
+      "loss": 0.7295,
+      "step": 639
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4286166627048035,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7521,
+      "step": 640
+    },
+    {
+      "epoch": 0.5128,
+      "grad_norm": 0.38134404488493295,
+      "learning_rate": 0.00010077761410690172,
+      "loss": 0.6871,
+      "step": 641
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.3649315007168492,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7501,
+      "step": 642
+    },
+    {
+      "epoch": 0.5144,
+      "grad_norm": 0.32894272639704447,
+      "learning_rate": 0.00010025920702439051,
+      "loss": 0.6173,
+      "step": 643
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3979670296084114,
+      "learning_rate": 0.0001,
+      "loss": 0.7092,
+      "step": 644
+    },
+    {
+      "epoch": 0.516,
+      "grad_norm": 0.375818746489365,
+      "learning_rate": 9.97407929756095e-05,
+      "loss": 0.6522,
+      "step": 645
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.3298020418260722,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6296,
+      "step": 646
+    },
+    {
+      "epoch": 0.5176,
+      "grad_norm": 0.35704944917221043,
+      "learning_rate": 9.92223858930983e-05,
+      "loss": 0.6848,
+      "step": 647
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3593459758999515,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6757,
+      "step": 648
+    },
+    {
+      "epoch": 0.5192,
+      "grad_norm": 0.41017337430423023,
+      "learning_rate": 9.870399970920932e-05,
+      "loss": 0.7464,
+      "step": 649
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.38870788914625504,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7182,
+      "step": 650
+    },
+    {
+      "epoch": 0.5208,
+      "grad_norm": 0.36075187159462824,
+      "learning_rate": 9.818564835573323e-05,
+      "loss": 0.6606,
+      "step": 651
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3465952192890516,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.657,
+      "step": 652
+    },
+    {
+      "epoch": 0.5224,
+      "grad_norm": 0.3890310053274927,
+      "learning_rate": 9.766734576352478e-05,
+      "loss": 0.729,
+      "step": 653
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.36891429507117385,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6528,
+      "step": 654
+    },
+    {
+      "epoch": 0.524,
+      "grad_norm": 0.3889508470707155,
+      "learning_rate": 9.714910586212816e-05,
+      "loss": 0.703,
+      "step": 655
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3708482772935566,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7199,
+      "step": 656
+    },
+    {
+      "epoch": 0.5256,
+      "grad_norm": 0.38630447929278516,
+      "learning_rate": 9.663094257940258e-05,
+      "loss": 0.6558,
+      "step": 657
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.4252677946241481,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.76,
+      "step": 658
+    },
+    {
+      "epoch": 0.5272,
+      "grad_norm": 0.3904895795341803,
+      "learning_rate": 9.611286984114841e-05,
+      "loss": 0.7429,
+      "step": 659
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3903828145173097,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.6662,
+      "step": 660
+    },
+    {
+      "epoch": 0.5288,
+      "grad_norm": 0.4197535838547231,
+      "learning_rate": 9.559490157073236e-05,
+      "loss": 0.6786,
+      "step": 661
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.39358250990389637,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.6959,
+      "step": 662
+    },
+    {
+      "epoch": 0.5304,
+      "grad_norm": 0.39934874751805954,
+      "learning_rate": 9.507705168871358e-05,
+      "loss": 0.7437,
+      "step": 663
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.37851355446137275,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7198,
+      "step": 664
+    },
+    {
+      "epoch": 0.532,
+      "grad_norm": 0.45341797186688787,
+      "learning_rate": 9.455933411246958e-05,
+      "loss": 0.7767,
+      "step": 665
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.36237706418665944,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.644,
+      "step": 666
+    },
+    {
+      "epoch": 0.5336,
+      "grad_norm": 0.39774273457816467,
+      "learning_rate": 9.404176275582208e-05,
+      "loss": 0.6457,
+      "step": 667
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.47254898712867793,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7675,
+      "step": 668
+    },
+    {
+      "epoch": 0.5352,
+      "grad_norm": 0.5104606186884628,
+      "learning_rate": 9.352435152866298e-05,
+      "loss": 0.8824,
+      "step": 669
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.42332695730487313,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7512,
+      "step": 670
+    },
+    {
+      "epoch": 0.5368,
+      "grad_norm": 0.40240037498992387,
+      "learning_rate": 9.300711433658087e-05,
+      "loss": 0.7049,
+      "step": 671
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4098457092214055,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7245,
+      "step": 672
+    },
+    {
+      "epoch": 0.5384,
+      "grad_norm": 0.4872257879709029,
+      "learning_rate": 9.249006508048694e-05,
+      "loss": 0.6227,
+      "step": 673
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.3397579782310635,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6183,
+      "step": 674
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.41727071576691904,
+      "learning_rate": 9.197321765624152e-05,
+      "loss": 0.757,
+      "step": 675
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.37577272312505366,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.6718,
+      "step": 676
+    },
+    {
+      "epoch": 0.5416,
+      "grad_norm": 0.4036720264091577,
+      "learning_rate": 9.145658595428074e-05,
+      "loss": 0.7189,
+      "step": 677
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.42896781846769505,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.7073,
+      "step": 678
+    },
+    {
+      "epoch": 0.5432,
+      "grad_norm": 0.47154385000572885,
+      "learning_rate": 9.09401838592431e-05,
+      "loss": 0.8136,
+      "step": 679
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.38893628944026587,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.6901,
+      "step": 680
+    },
+    {
+      "epoch": 0.5448,
+      "grad_norm": 0.3815592852819167,
+      "learning_rate": 9.04240252495963e-05,
+      "loss": 0.7259,
+      "step": 681
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.41294996353926877,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7351,
+      "step": 682
+    },
+    {
+      "epoch": 0.5464,
+      "grad_norm": 0.4175120763011901,
+      "learning_rate": 8.990812399726435e-05,
+      "loss": 0.7051,
+      "step": 683
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.46669499607969855,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7749,
+      "step": 684
+    },
+    {
+      "epoch": 0.548,
+      "grad_norm": 0.34556767002863736,
+      "learning_rate": 8.939249396725467e-05,
+      "loss": 0.6551,
+      "step": 685
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.4274465736455529,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7329,
+      "step": 686
+    },
+    {
+      "epoch": 0.5496,
+      "grad_norm": 0.43355152213796083,
+      "learning_rate": 8.887714901728551e-05,
+      "loss": 0.7056,
+      "step": 687
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.6048622245573282,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.794,
+      "step": 688
+    },
+    {
+      "epoch": 0.5512,
+      "grad_norm": 0.3441757994502071,
+      "learning_rate": 8.836210299741346e-05,
+      "loss": 0.6646,
+      "step": 689
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.42196882403388164,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.7324,
+      "step": 690
+    },
+    {
+      "epoch": 0.5528,
+      "grad_norm": 0.4066753822516913,
+      "learning_rate": 8.784736974966135e-05,
+      "loss": 0.7334,
+      "step": 691
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.40730272513897353,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.709,
+      "step": 692
+    },
+    {
+      "epoch": 0.5544,
+      "grad_norm": 0.4081083754585546,
+      "learning_rate": 8.733296310764611e-05,
+      "loss": 0.7078,
+      "step": 693
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.37441700960388963,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7076,
+      "step": 694
+    },
+    {
+      "epoch": 0.556,
+      "grad_norm": 0.4167855001509996,
+      "learning_rate": 8.6818896896207e-05,
+      "loss": 0.7298,
+      "step": 695
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3893163424833126,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7387,
+      "step": 696
+    },
+    {
+      "epoch": 0.5576,
+      "grad_norm": 0.39857255128153724,
+      "learning_rate": 8.63051849310342e-05,
+      "loss": 0.7165,
+      "step": 697
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.4535351896839242,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.7973,
+      "step": 698
+    },
+    {
+      "epoch": 0.5592,
+      "grad_norm": 0.4155102465200645,
+      "learning_rate": 8.579184101829734e-05,
+      "loss": 0.6877,
+      "step": 699
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.38378824297310854,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6466,
+      "step": 700
+    },
+    {
+      "epoch": 0.5608,
+      "grad_norm": 0.4113967300469492,
+      "learning_rate": 8.527887895427454e-05,
+      "loss": 0.748,
+      "step": 701
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.39313027960520175,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7822,
+      "step": 702
+    },
+    {
+      "epoch": 0.5624,
+      "grad_norm": 0.44822833663854883,
+      "learning_rate": 8.476631252498162e-05,
+      "loss": 0.7345,
+      "step": 703
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.34454468305430036,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.677,
+      "step": 704
+    },
+    {
+      "epoch": 0.564,
+      "grad_norm": 0.3569000138406648,
+      "learning_rate": 8.425415550580162e-05,
+      "loss": 0.6789,
+      "step": 705
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.4362006511176541,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6211,
+      "step": 706
+    },
+    {
+      "epoch": 0.5656,
+      "grad_norm": 0.366006201500598,
+      "learning_rate": 8.374242166111448e-05,
+      "loss": 0.6798,
+      "step": 707
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.4232382038227556,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7744,
+      "step": 708
+    },
+    {
+      "epoch": 0.5672,
+      "grad_norm": 0.371496901813501,
+      "learning_rate": 8.323112474392731e-05,
+      "loss": 0.6871,
+      "step": 709
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.6731000117772543,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7085,
+      "step": 710
+    },
+    {
+      "epoch": 0.5688,
+      "grad_norm": 0.39080670456360833,
+      "learning_rate": 8.272027849550457e-05,
+      "loss": 0.6256,
+      "step": 711
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4117997518231274,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6984,
+      "step": 712
+    },
+    {
+      "epoch": 0.5704,
+      "grad_norm": 0.4822700301622576,
+      "learning_rate": 8.220989664499878e-05,
+      "loss": 0.7917,
+      "step": 713
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.41642009981384603,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6661,
+      "step": 714
+    },
+    {
+      "epoch": 0.572,
+      "grad_norm": 0.43828493263885393,
+      "learning_rate": 8.169999290908188e-05,
+      "loss": 0.7156,
+      "step": 715
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4240648870455564,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7666,
+      "step": 716
+    },
+    {
+      "epoch": 0.5736,
+      "grad_norm": 0.5280655669164235,
+      "learning_rate": 8.119058099157604e-05,
+      "loss": 0.8215,
+      "step": 717
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.4314632075111001,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7157,
+      "step": 718
+    },
+    {
+      "epoch": 0.5752,
+      "grad_norm": 0.42766372542648995,
+      "learning_rate": 8.068167458308582e-05,
+      "loss": 0.7356,
+      "step": 719
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3536232789675204,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6305,
+      "step": 720
+    },
+    {
+      "epoch": 0.5768,
+      "grad_norm": 0.396787968011526,
+      "learning_rate": 8.017328736063006e-05,
+      "loss": 0.688,
+      "step": 721
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.36622359112802105,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6104,
+      "step": 722
+    },
+    {
+      "epoch": 0.5784,
+      "grad_norm": 0.4293433853098972,
+      "learning_rate": 7.966543298727425e-05,
+      "loss": 0.7291,
+      "step": 723
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3980734993834797,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7039,
+      "step": 724
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.3904293866446969,
+      "learning_rate": 7.915812511176347e-05,
+      "loss": 0.6716,
+      "step": 725
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.36995115448433163,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6903,
+      "step": 726
+    },
+    {
+      "epoch": 0.5816,
+      "grad_norm": 0.32560215936571923,
+      "learning_rate": 7.865137736815535e-05,
+      "loss": 0.6638,
+      "step": 727
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.4174288747054683,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.7212,
+      "step": 728
+    },
+    {
+      "epoch": 0.5832,
+      "grad_norm": 0.3730951899363253,
+      "learning_rate": 7.814520337545406e-05,
+      "loss": 0.6651,
+      "step": 729
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.4063251235311768,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.7293,
+      "step": 730
+    },
+    {
+      "epoch": 0.5848,
+      "grad_norm": 0.4499268916836471,
+      "learning_rate": 7.763961673724379e-05,
+      "loss": 0.7426,
+      "step": 731
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.396633091460796,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6957,
+      "step": 732
+    },
+    {
+      "epoch": 0.5864,
+      "grad_norm": 0.3578247361979313,
+      "learning_rate": 7.713463104132345e-05,
+      "loss": 0.6577,
+      "step": 733
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.38166873777131227,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.6496,
+      "step": 734
+    },
+    {
+      "epoch": 0.588,
+      "grad_norm": 0.4152385411084173,
+      "learning_rate": 7.663025985934158e-05,
+      "loss": 0.7373,
+      "step": 735
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4578866385374338,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6814,
+      "step": 736
+    },
+    {
+      "epoch": 0.5896,
+      "grad_norm": 0.3775144819294085,
+      "learning_rate": 7.61265167464313e-05,
+      "loss": 0.7132,
+      "step": 737
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.37444458875836906,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.7106,
+      "step": 738
+    },
+    {
+      "epoch": 0.5912,
+      "grad_norm": 0.40369298334943393,
+      "learning_rate": 7.562341524084623e-05,
+      "loss": 0.728,
+      "step": 739
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.426758911971294,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7277,
+      "step": 740
+    },
+    {
+      "epoch": 0.5928,
+      "grad_norm": 0.5001292797021929,
+      "learning_rate": 7.512096886359664e-05,
+      "loss": 0.6971,
+      "step": 741
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.3597006649006697,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6538,
+      "step": 742
+    },
+    {
+      "epoch": 0.5944,
+      "grad_norm": 0.3568043116224071,
+      "learning_rate": 7.461919111808595e-05,
+      "loss": 0.7017,
+      "step": 743
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3651169402407518,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6328,
+      "step": 744
+    },
+    {
+      "epoch": 0.596,
+      "grad_norm": 0.3844907448236401,
+      "learning_rate": 7.411809548974792e-05,
+      "loss": 0.7112,
+      "step": 745
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.4043079489440896,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6742,
+      "step": 746
+    },
+    {
+      "epoch": 0.5976,
+      "grad_norm": 0.4227127648656904,
+      "learning_rate": 7.361769544568425e-05,
+      "loss": 0.7626,
+      "step": 747
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.44251139527084404,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7496,
+      "step": 748
+    },
+    {
+      "epoch": 0.5992,
+      "grad_norm": 0.39264336976041503,
+      "learning_rate": 7.311800443430251e-05,
+      "loss": 0.7107,
+      "step": 749
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.3854740044037131,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6725,
+      "step": 750
+    },
+    {
+      "epoch": 0.6008,
+      "grad_norm": 0.37218277208281797,
+      "learning_rate": 7.26190358849548e-05,
+      "loss": 0.6449,
+      "step": 751
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3725759274566575,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.654,
+      "step": 752
+    },
+    {
+      "epoch": 0.6024,
+      "grad_norm": 0.41710753488610125,
+      "learning_rate": 7.212080320757695e-05,
+      "loss": 0.7009,
+      "step": 753
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4616704403436593,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.7656,
+      "step": 754
+    },
+    {
+      "epoch": 0.604,
+      "grad_norm": 0.3793693671206012,
+      "learning_rate": 7.162331979232783e-05,
+      "loss": 0.6986,
+      "step": 755
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.4365789904267567,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.723,
+      "step": 756
+    },
+    {
+      "epoch": 0.6056,
+      "grad_norm": 0.33849276678841417,
+      "learning_rate": 7.112659900922976e-05,
+      "loss": 0.5828,
+      "step": 757
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3770313352866923,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6679,
+      "step": 758
+    },
+    {
+      "epoch": 0.6072,
+      "grad_norm": 0.35909341550316776,
+      "learning_rate": 7.06306542078091e-05,
+      "loss": 0.6794,
+      "step": 759
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4952730478303068,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.773,
+      "step": 760
+    },
+    {
+      "epoch": 0.6088,
+      "grad_norm": 0.3779200278363108,
+      "learning_rate": 7.013549871673736e-05,
+      "loss": 0.6774,
+      "step": 761
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.5152186295001627,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7447,
+      "step": 762
+    },
+    {
+      "epoch": 0.6104,
+      "grad_norm": 0.4652336562509862,
+      "learning_rate": 6.964114584347316e-05,
+      "loss": 0.8401,
+      "step": 763
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5523220933081036,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6601,
+      "step": 764
+    },
+    {
+      "epoch": 0.612,
+      "grad_norm": 0.4303701867318267,
+      "learning_rate": 6.914760887390452e-05,
+      "loss": 0.7262,
+      "step": 765
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.37474398699674083,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.7258,
+      "step": 766
+    },
+    {
+      "epoch": 0.6136,
+      "grad_norm": 0.35065173582230624,
+      "learning_rate": 6.865490107199181e-05,
+      "loss": 0.6654,
+      "step": 767
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.36703219558742856,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6379,
+      "step": 768
+    },
+    {
+      "epoch": 0.6152,
+      "grad_norm": 0.41869079450058233,
+      "learning_rate": 6.816303567941112e-05,
+      "loss": 0.7464,
+      "step": 769
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.3979451147642651,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.7428,
+      "step": 770
+    },
+    {
+      "epoch": 0.6168,
+      "grad_norm": 0.3846429827012229,
+      "learning_rate": 6.767202591519875e-05,
+      "loss": 0.728,
+      "step": 771
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.41960066504509924,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.6702,
+      "step": 772
+    },
+    {
+      "epoch": 0.6184,
+      "grad_norm": 0.3770819093470327,
+      "learning_rate": 6.718188497539554e-05,
+      "loss": 0.6202,
+      "step": 773
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.4206725247444838,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.7032,
+      "step": 774
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.35434322028668563,
+      "learning_rate": 6.669262603269246e-05,
+      "loss": 0.591,
+      "step": 775
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.40789924022628743,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6677,
+      "step": 776
+    },
+    {
+      "epoch": 0.6216,
+      "grad_norm": 0.3381255774020731,
+      "learning_rate": 6.620426223607654e-05,
+      "loss": 0.6213,
+      "step": 777
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.37479881796793857,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6631,
+      "step": 778
+    },
+    {
+      "epoch": 0.6232,
+      "grad_norm": 0.3758333174731441,
+      "learning_rate": 6.571680671047749e-05,
+      "loss": 0.6406,
+      "step": 779
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.42763347900484716,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7203,
+      "step": 780
+    },
+    {
+      "epoch": 0.6248,
+      "grad_norm": 0.358013225260845,
+      "learning_rate": 6.523027255641493e-05,
+      "loss": 0.6402,
+      "step": 781
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.44232850144520364,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.736,
+      "step": 782
+    },
+    {
+      "epoch": 0.6264,
+      "grad_norm": 0.3868589738684474,
+      "learning_rate": 6.474467284964634e-05,
+      "loss": 0.6979,
+      "step": 783
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.38519475468830017,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6938,
+      "step": 784
+    },
+    {
+      "epoch": 0.628,
+      "grad_norm": 0.38018019360013416,
+      "learning_rate": 6.426002064081565e-05,
+      "loss": 0.6368,
+      "step": 785
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.5303594601394425,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.7593,
+      "step": 786
+    },
+    {
+      "epoch": 0.6296,
+      "grad_norm": 0.3758840665843039,
+      "learning_rate": 6.377632895510248e-05,
+      "loss": 0.6798,
+      "step": 787
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.5330733022028751,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7707,
+      "step": 788
+    },
+    {
+      "epoch": 0.6312,
+      "grad_norm": 0.4053420748203502,
+      "learning_rate": 6.329361079187199e-05,
+      "loss": 0.6961,
+      "step": 789
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.4000744617505393,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7312,
+      "step": 790
+    },
+    {
+      "epoch": 0.6328,
+      "grad_norm": 0.37461660955043896,
+      "learning_rate": 6.281187912432587e-05,
+      "loss": 0.6765,
+      "step": 791
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3681625533583791,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6539,
+      "step": 792
+    },
+    {
+      "epoch": 0.6344,
+      "grad_norm": 0.3985134606316111,
+      "learning_rate": 6.233114689915316e-05,
+      "loss": 0.6728,
+      "step": 793
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.44143279098238797,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.743,
+      "step": 794
+    },
+    {
+      "epoch": 0.636,
+      "grad_norm": 0.7081456800521198,
+      "learning_rate": 6.18514270361827e-05,
+      "loss": 0.7892,
+      "step": 795
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.3933123238271958,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6715,
+      "step": 796
+    },
+    {
+      "epoch": 0.6376,
+      "grad_norm": 0.3601641373087553,
+      "learning_rate": 6.13727324280358e-05,
+      "loss": 0.6267,
+      "step": 797
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4689956926826061,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.7738,
+      "step": 798
+    },
+    {
+      "epoch": 0.6392,
+      "grad_norm": 0.4792144464704637,
+      "learning_rate": 6.08950759397797e-05,
+      "loss": 0.7722,
+      "step": 799
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3742389485992206,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6807,
+      "step": 800
+    },
+    {
+      "epoch": 0.6408,
+      "grad_norm": 0.3301905166954284,
+      "learning_rate": 6.0418470408581774e-05,
+      "loss": 0.6268,
+      "step": 801
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.35988480884110635,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6726,
+      "step": 802
+    },
+    {
+      "epoch": 0.6424,
+      "grad_norm": 0.3744294865993888,
+      "learning_rate": 5.9942928643364724e-05,
+      "loss": 0.6467,
+      "step": 803
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.40255348248906203,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6907,
+      "step": 804
+    },
+    {
+      "epoch": 0.644,
+      "grad_norm": 0.43247292970458023,
+      "learning_rate": 5.946846342446214e-05,
+      "loss": 0.6995,
+      "step": 805
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.36178749321741194,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6855,
+      "step": 806
+    },
+    {
+      "epoch": 0.6456,
+      "grad_norm": 0.4368786163665237,
+      "learning_rate": 5.899508750327501e-05,
+      "loss": 0.6982,
+      "step": 807
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.36341677587811755,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6613,
+      "step": 808
+    },
+    {
+      "epoch": 0.6472,
+      "grad_norm": 0.43050183605813286,
+      "learning_rate": 5.8522813601929324e-05,
+      "loss": 0.7721,
+      "step": 809
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.32973117042968936,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.5817,
+      "step": 810
+    },
+    {
+      "epoch": 0.6488,
+      "grad_norm": 0.4470699631633555,
+      "learning_rate": 5.80516544129337e-05,
+      "loss": 0.6889,
+      "step": 811
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4608207968552782,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.8078,
+      "step": 812
+    },
+    {
+      "epoch": 0.6504,
+      "grad_norm": 0.3593112975488266,
+      "learning_rate": 5.758162259883867e-05,
+      "loss": 0.6923,
+      "step": 813
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3841678464380535,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.6425,
+      "step": 814
+    },
+    {
+      "epoch": 0.652,
+      "grad_norm": 0.3828351995854355,
+      "learning_rate": 5.7112730791896207e-05,
+      "loss": 0.681,
+      "step": 815
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4607693007659546,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.7063,
+      "step": 816
+    },
+    {
+      "epoch": 0.6536,
+      "grad_norm": 0.3750724520710695,
+      "learning_rate": 5.664499159372017e-05,
+      "loss": 0.63,
+      "step": 817
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.4031151784274128,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7019,
+      "step": 818
+    },
+    {
+      "epoch": 0.6552,
+      "grad_norm": 0.3820735547833116,
+      "learning_rate": 5.617841757494762e-05,
+      "loss": 0.6912,
+      "step": 819
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3794383491039805,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6505,
+      "step": 820
+    },
+    {
+      "epoch": 0.6568,
+      "grad_norm": 0.40145334915032993,
+      "learning_rate": 5.5713021274901335e-05,
+      "loss": 0.7471,
+      "step": 821
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.36627442458854886,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6942,
+      "step": 822
+    },
+    {
+      "epoch": 0.6584,
+      "grad_norm": 0.3894765216107813,
+      "learning_rate": 5.524881520125229e-05,
+      "loss": 0.6787,
+      "step": 823
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.38355286480633677,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7154,
+      "step": 824
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.3589843005850102,
+      "learning_rate": 5.4785811829683764e-05,
+      "loss": 0.6107,
+      "step": 825
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.37208511941913724,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6659,
+      "step": 826
+    },
+    {
+      "epoch": 0.6616,
+      "grad_norm": 0.36916335904670466,
+      "learning_rate": 5.432402360355615e-05,
+      "loss": 0.6546,
+      "step": 827
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4353339417797532,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.7128,
+      "step": 828
+    },
+    {
+      "epoch": 0.6632,
+      "grad_norm": 0.4005890631619237,
+      "learning_rate": 5.386346293357242e-05,
+      "loss": 0.7092,
+      "step": 829
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.3756446420199134,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6778,
+      "step": 830
+    },
+    {
+      "epoch": 0.6648,
+      "grad_norm": 0.37576922952546127,
+      "learning_rate": 5.3404142197444506e-05,
+      "loss": 0.601,
+      "step": 831
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.42520146435293865,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6656,
+      "step": 832
+    },
+    {
+      "epoch": 0.6664,
+      "grad_norm": 0.39262051542575105,
+      "learning_rate": 5.2946073739560706e-05,
+      "loss": 0.7484,
+      "step": 833
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.37552559270813046,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6849,
+      "step": 834
+    },
+    {
+      "epoch": 0.668,
+      "grad_norm": 0.3695064574427169,
+      "learning_rate": 5.248926987065417e-05,
+      "loss": 0.6113,
+      "step": 835
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4279487729932737,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6826,
+      "step": 836
+    },
+    {
+      "epoch": 0.6696,
+      "grad_norm": 0.3938359588528079,
+      "learning_rate": 5.203374286747158e-05,
+      "loss": 0.6431,
+      "step": 837
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.3590555518218238,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6433,
+      "step": 838
+    },
+    {
+      "epoch": 0.6712,
+      "grad_norm": 0.4540490980629825,
+      "learning_rate": 5.15795049724435e-05,
+      "loss": 0.7745,
+      "step": 839
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3771570585049773,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6832,
+      "step": 840
+    },
+    {
+      "epoch": 0.6728,
+      "grad_norm": 0.4415921845867145,
+      "learning_rate": 5.112656839335543e-05,
+      "loss": 0.7557,
+      "step": 841
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.40741586224130755,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7413,
+      "step": 842
+    },
+    {
+      "epoch": 0.6744,
+      "grad_norm": 0.4434471970193744,
+      "learning_rate": 5.0674945303019526e-05,
+      "loss": 0.7126,
+      "step": 843
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3894162484448366,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.627,
+      "step": 844
+    },
+    {
+      "epoch": 0.676,
+      "grad_norm": 0.3863866092532346,
+      "learning_rate": 5.022464783894744e-05,
+      "loss": 0.676,
+      "step": 845
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.3776175059677852,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7,
+      "step": 846
+    },
+    {
+      "epoch": 0.6776,
+      "grad_norm": 0.38833751137731987,
+      "learning_rate": 4.977568810302432e-05,
+      "loss": 0.6657,
+      "step": 847
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3643478291891471,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6462,
+      "step": 848
+    },
+    {
+      "epoch": 0.6792,
+      "grad_norm": 0.3718531297835898,
+      "learning_rate": 4.9328078161183464e-05,
+      "loss": 0.6207,
+      "step": 849
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.37090002483660917,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6015,
+      "step": 850
+    },
+    {
+      "epoch": 0.6808,
+      "grad_norm": 0.3818802887497591,
+      "learning_rate": 4.88818300430819e-05,
+      "loss": 0.7237,
+      "step": 851
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.4216500531432546,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6384,
+      "step": 852
+    },
+    {
+      "epoch": 0.6824,
+      "grad_norm": 0.43490214335818284,
+      "learning_rate": 4.843695574177737e-05,
+      "loss": 0.7342,
+      "step": 853
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.3874365160955942,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6828,
+      "step": 854
+    },
+    {
+      "epoch": 0.684,
+      "grad_norm": 0.4101791944198361,
+      "learning_rate": 4.7993467213405706e-05,
+      "loss": 0.6987,
+      "step": 855
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4019633596180036,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6716,
+      "step": 856
+    },
+    {
+      "epoch": 0.6856,
+      "grad_norm": 0.3881943766742739,
+      "learning_rate": 4.755137637685979e-05,
+      "loss": 0.682,
+      "step": 857
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.37086431272395237,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6382,
+      "step": 858
+    },
+    {
+      "epoch": 0.6872,
+      "grad_norm": 0.3229897376133167,
+      "learning_rate": 4.7110695113469085e-05,
+      "loss": 0.5634,
+      "step": 859
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.37195166028459176,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6126,
+      "step": 860
+    },
+    {
+      "epoch": 0.6888,
+      "grad_norm": 0.35736672026268756,
+      "learning_rate": 4.6671435266680216e-05,
+      "loss": 0.6428,
+      "step": 861
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.39181342064986285,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6178,
+      "step": 862
+    },
+    {
+      "epoch": 0.6904,
+      "grad_norm": 0.4622321102444061,
+      "learning_rate": 4.623360864173893e-05,
+      "loss": 0.7532,
+      "step": 863
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3508420398934294,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6165,
+      "step": 864
+    },
+    {
+      "epoch": 0.692,
+      "grad_norm": 0.43584792026732694,
+      "learning_rate": 4.579722700537268e-05,
+      "loss": 0.6993,
+      "step": 865
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3865443765265882,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6686,
+      "step": 866
+    },
+    {
+      "epoch": 0.6936,
+      "grad_norm": 0.44089726242809385,
+      "learning_rate": 4.5362302085474254e-05,
+      "loss": 0.7401,
+      "step": 867
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3636091967598959,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6735,
+      "step": 868
+    },
+    {
+      "epoch": 0.6952,
+      "grad_norm": 0.45260151896253265,
+      "learning_rate": 4.492884557078688e-05,
+      "loss": 0.8166,
+      "step": 869
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.3716433090985538,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6785,
+      "step": 870
+    },
+    {
+      "epoch": 0.6968,
+      "grad_norm": 0.3490016723110263,
+      "learning_rate": 4.449686911058992e-05,
+      "loss": 0.6338,
+      "step": 871
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.35261459639093856,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6912,
+      "step": 872
+    },
+    {
+      "epoch": 0.6984,
+      "grad_norm": 0.430517813454168,
+      "learning_rate": 4.406638431438576e-05,
+      "loss": 0.7577,
+      "step": 873
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.39080082714846687,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7045,
+      "step": 874
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.3845683325648363,
+      "learning_rate": 4.36374027515878e-05,
+      "loss": 0.6631,
+      "step": 875
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4011660221395093,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6698,
+      "step": 876
+    },
+    {
+      "epoch": 0.7016,
+      "grad_norm": 0.3803430175217997,
+      "learning_rate": 4.320993595120969e-05,
+      "loss": 0.7293,
+      "step": 877
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.4345641406916766,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.7021,
+      "step": 878
+    },
+    {
+      "epoch": 0.7032,
+      "grad_norm": 0.37788514553335906,
+      "learning_rate": 4.278399540155536e-05,
+      "loss": 0.6335,
+      "step": 879
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.3754341531521682,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.613,
+      "step": 880
+    },
+    {
+      "epoch": 0.7048,
+      "grad_norm": 0.3643334891575628,
+      "learning_rate": 4.2359592549910145e-05,
+      "loss": 0.6607,
+      "step": 881
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3886671386423688,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6659,
+      "step": 882
+    },
+    {
+      "epoch": 0.7064,
+      "grad_norm": 0.3968260849187156,
+      "learning_rate": 4.193673880223339e-05,
+      "loss": 0.654,
+      "step": 883
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.4574918781588961,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7591,
+      "step": 884
+    },
+    {
+      "epoch": 0.708,
+      "grad_norm": 0.39583832461725227,
+      "learning_rate": 4.1515445522851784e-05,
+      "loss": 0.6505,
+      "step": 885
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.3871856632764471,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7419,
+      "step": 886
+    },
+    {
+      "epoch": 0.7096,
+      "grad_norm": 0.39802865534197646,
+      "learning_rate": 4.109572403415386e-05,
+      "loss": 0.6144,
+      "step": 887
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4036792124871241,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6651,
+      "step": 888
+    },
+    {
+      "epoch": 0.7112,
+      "grad_norm": 0.3629294435919223,
+      "learning_rate": 4.0677585616285774e-05,
+      "loss": 0.6717,
+      "step": 889
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4223531830282377,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.702,
+      "step": 890
+    },
+    {
+      "epoch": 0.7128,
+      "grad_norm": 0.3546390164764367,
+      "learning_rate": 4.026104150684835e-05,
+      "loss": 0.6737,
+      "step": 891
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3606188495178436,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6783,
+      "step": 892
+    },
+    {
+      "epoch": 0.7144,
+      "grad_norm": 0.467315941854401,
+      "learning_rate": 3.984610290059467e-05,
+      "loss": 0.7204,
+      "step": 893
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.3846970151375505,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.5743,
+      "step": 894
+    },
+    {
+      "epoch": 0.716,
+      "grad_norm": 0.37861455834306523,
+      "learning_rate": 3.943278094912946e-05,
+      "loss": 0.6426,
+      "step": 895
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3596346935109103,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6619,
+      "step": 896
+    },
+    {
+      "epoch": 0.7176,
+      "grad_norm": 0.36957034226288926,
+      "learning_rate": 3.902108676060937e-05,
+      "loss": 0.7005,
+      "step": 897
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.3847442596608294,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6349,
+      "step": 898
+    },
+    {
+      "epoch": 0.7192,
+      "grad_norm": 0.37152537084018145,
+      "learning_rate": 3.861103139944449e-05,
+      "loss": 0.6265,
+      "step": 899
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.4575634958571184,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.7508,
+      "step": 900
+    },
+    {
+      "epoch": 0.7208,
+      "grad_norm": 0.38577413678044076,
+      "learning_rate": 3.820262588600074e-05,
+      "loss": 0.6927,
+      "step": 901
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3302322356224181,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6325,
+      "step": 902
+    },
+    {
+      "epoch": 0.7224,
+      "grad_norm": 0.39399513814722714,
+      "learning_rate": 3.7795881196303995e-05,
+      "loss": 0.7318,
+      "step": 903
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4375238247354526,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.7364,
+      "step": 904
+    },
+    {
+      "epoch": 0.724,
+      "grad_norm": 0.4012366402819657,
+      "learning_rate": 3.739080826174498e-05,
+      "loss": 0.6626,
+      "step": 905
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4153527247575827,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6455,
+      "step": 906
+    },
+    {
+      "epoch": 0.7256,
+      "grad_norm": 0.45562157771641276,
+      "learning_rate": 3.6987417968785366e-05,
+      "loss": 0.6857,
+      "step": 907
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.37143963398553415,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6559,
+      "step": 908
+    },
+    {
+      "epoch": 0.7272,
+      "grad_norm": 0.35400310240819777,
+      "learning_rate": 3.658572115866541e-05,
+      "loss": 0.6326,
+      "step": 909
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4196101020601408,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6649,
+      "step": 910
+    },
+    {
+      "epoch": 0.7288,
+      "grad_norm": 0.4271247927155786,
+      "learning_rate": 3.618572862711247e-05,
+      "loss": 0.7248,
+      "step": 911
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.39936532676017306,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6407,
+      "step": 912
+    },
+    {
+      "epoch": 0.7304,
+      "grad_norm": 0.40373812391774183,
+      "learning_rate": 3.578745112405083e-05,
+      "loss": 0.6659,
+      "step": 913
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.4227238458265484,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7585,
+      "step": 914
+    },
+    {
+      "epoch": 0.732,
+      "grad_norm": 0.36629473480527946,
+      "learning_rate": 3.539089935331294e-05,
+      "loss": 0.6324,
+      "step": 915
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4150563120212974,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.735,
+      "step": 916
+    },
+    {
+      "epoch": 0.7336,
+      "grad_norm": 0.38188336972420933,
+      "learning_rate": 3.4996083972351515e-05,
+      "loss": 0.6507,
+      "step": 917
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.42389403459769254,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.7406,
+      "step": 918
+    },
+    {
+      "epoch": 0.7352,
+      "grad_norm": 0.36331268006366385,
+      "learning_rate": 3.4603015591953395e-05,
+      "loss": 0.6199,
+      "step": 919
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.37595893886168213,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.615,
+      "step": 920
+    },
+    {
+      "epoch": 0.7368,
+      "grad_norm": 0.44337545720228805,
+      "learning_rate": 3.421170477595419e-05,
+      "loss": 0.6587,
+      "step": 921
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.4048425474160189,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.7039,
+      "step": 922
+    },
+    {
+      "epoch": 0.7384,
+      "grad_norm": 0.399459405268422,
+      "learning_rate": 3.3822162040954354e-05,
+      "loss": 0.7177,
+      "step": 923
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.438929791874048,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6438,
+      "step": 924
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.32299495901286296,
+      "learning_rate": 3.34343978560367e-05,
+      "loss": 0.5864,
+      "step": 925
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.41176721861809107,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.7035,
+      "step": 926
+    },
+    {
+      "epoch": 0.7416,
+      "grad_norm": 0.41891098855382514,
+      "learning_rate": 3.3048422642484886e-05,
+      "loss": 0.6946,
+      "step": 927
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.34807605861789076,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.5932,
+      "step": 928
+    },
+    {
+      "epoch": 0.7432,
+      "grad_norm": 0.39692972686466704,
+      "learning_rate": 3.266424677350346e-05,
+      "loss": 0.7343,
+      "step": 929
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.44396979022684196,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.7643,
+      "step": 930
+    },
+    {
+      "epoch": 0.7448,
+      "grad_norm": 0.3504173730748221,
+      "learning_rate": 3.228188057393895e-05,
+      "loss": 0.6379,
+      "step": 931
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3709172492987434,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.624,
+      "step": 932
+    },
+    {
+      "epoch": 0.7464,
+      "grad_norm": 0.3551495218636917,
+      "learning_rate": 3.190133432000252e-05,
+      "loss": 0.6517,
+      "step": 933
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.3610548339661083,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.686,
+      "step": 934
+    },
+    {
+      "epoch": 0.748,
+      "grad_norm": 0.35221470234389773,
+      "learning_rate": 3.1522618238993725e-05,
+      "loss": 0.5881,
+      "step": 935
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.47129598984211485,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.767,
+      "step": 936
+    },
+    {
+      "epoch": 0.7496,
+      "grad_norm": 0.4281984592959207,
+      "learning_rate": 3.114574250902558e-05,
+      "loss": 0.6818,
+      "step": 937
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.4136989144183822,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.7458,
+      "step": 938
+    },
+    {
+      "epoch": 0.7512,
+      "grad_norm": 0.403398217159144,
+      "learning_rate": 3.077071725875116e-05,
+      "loss": 0.6889,
+      "step": 939
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3820759005938559,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6272,
+      "step": 940
+    },
+    {
+      "epoch": 0.7528,
+      "grad_norm": 0.36206663440057796,
+      "learning_rate": 3.0397552567091337e-05,
+      "loss": 0.6344,
+      "step": 941
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.382037624579827,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.7231,
+      "step": 942
+    },
+    {
+      "epoch": 0.7544,
+      "grad_norm": 0.4009265534647435,
+      "learning_rate": 3.0026258462963787e-05,
+      "loss": 0.6901,
+      "step": 943
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3668408443563656,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6501,
+      "step": 944
+    },
+    {
+      "epoch": 0.756,
+      "grad_norm": 0.46918634921474295,
+      "learning_rate": 2.9656844925013637e-05,
+      "loss": 0.7383,
+      "step": 945
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.4011047795393638,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.7044,
+      "step": 946
+    },
+    {
+      "epoch": 0.7576,
+      "grad_norm": 0.3978658551973164,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.6485,
+      "step": 947
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.4311999483799842,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.7244,
+      "step": 948
+    },
+    {
+      "epoch": 0.7592,
+      "grad_norm": 0.4405955409446885,
+      "learning_rate": 2.8923699209255284e-05,
+      "loss": 0.8154,
+      "step": 949
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.35148168463275686,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6329,
+      "step": 950
+    },
+    {
+      "epoch": 0.7608,
+      "grad_norm": 0.4391708027774675,
+      "learning_rate": 2.8559986734967282e-05,
+      "loss": 0.6859,
+      "step": 951
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.401233479375333,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6705,
+      "step": 952
+    },
+    {
+      "epoch": 0.7624,
+      "grad_norm": 0.38501925261003556,
+      "learning_rate": 2.819819423336775e-05,
+      "loss": 0.6618,
+      "step": 953
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3824208693962986,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6804,
+      "step": 954
+    },
+    {
+      "epoch": 0.764,
+      "grad_norm": 0.36708580610823355,
+      "learning_rate": 2.7838331427743282e-05,
+      "loss": 0.6485,
+      "step": 955
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.3908615234947648,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.7154,
+      "step": 956
+    },
+    {
+      "epoch": 0.7656,
+      "grad_norm": 0.4450760968444355,
+      "learning_rate": 2.7480407989519198e-05,
+      "loss": 0.7355,
+      "step": 957
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.4244818859774434,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6767,
+      "step": 958
+    },
+    {
+      "epoch": 0.7672,
+      "grad_norm": 0.38961087423927715,
+      "learning_rate": 2.712443353799984e-05,
+      "loss": 0.6501,
+      "step": 959
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.40128368555614596,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6782,
+      "step": 960
+    },
+    {
+      "epoch": 0.7688,
+      "grad_norm": 0.3962240832210282,
+      "learning_rate": 2.677041764010988e-05,
+      "loss": 0.6899,
+      "step": 961
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.3437743452274928,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6179,
+      "step": 962
+    },
+    {
+      "epoch": 0.7704,
+      "grad_norm": 0.44070949897096634,
+      "learning_rate": 2.6418369810137188e-05,
+      "loss": 0.7175,
+      "step": 963
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3986166249950004,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.6976,
+      "step": 964
+    },
+    {
+      "epoch": 0.772,
+      "grad_norm": 0.37466672276486496,
+      "learning_rate": 2.6068299509477266e-05,
+      "loss": 0.6763,
+      "step": 965
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.46823044700217803,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6756,
+      "step": 966
+    },
+    {
+      "epoch": 0.7736,
+      "grad_norm": 0.4077060986771547,
+      "learning_rate": 2.5720216146378917e-05,
+      "loss": 0.7142,
+      "step": 967
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4096255017834381,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.7017,
+      "step": 968
+    },
+    {
+      "epoch": 0.7752,
+      "grad_norm": 0.39291648839729004,
+      "learning_rate": 2.5374129075691265e-05,
+      "loss": 0.6814,
+      "step": 969
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.41934210837225766,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7685,
+      "step": 970
+    },
+    {
+      "epoch": 0.7768,
+      "grad_norm": 0.42463297275213213,
+      "learning_rate": 2.503004759861258e-05,
+      "loss": 0.6305,
+      "step": 971
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.39826267719605246,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6776,
+      "step": 972
+    },
+    {
+      "epoch": 0.7784,
+      "grad_norm": 0.4201012844491468,
+      "learning_rate": 2.4687980962440072e-05,
+      "loss": 0.6871,
+      "step": 973
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.42423210152452784,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6828,
+      "step": 974
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.3716904673400798,
+      "learning_rate": 2.4347938360321566e-05,
+      "loss": 0.65,
+      "step": 975
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.44715619118674044,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.7075,
+      "step": 976
+    },
+    {
+      "epoch": 0.7816,
+      "grad_norm": 0.44812690231399344,
+      "learning_rate": 2.400992893100822e-05,
+      "loss": 0.6788,
+      "step": 977
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4148743954758556,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6515,
+      "step": 978
+    },
+    {
+      "epoch": 0.7832,
+      "grad_norm": 0.3815731146703102,
+      "learning_rate": 2.3673961758609152e-05,
+      "loss": 0.7249,
+      "step": 979
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4428462685419307,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7174,
+      "step": 980
+    },
+    {
+      "epoch": 0.7848,
+      "grad_norm": 0.3433519946274387,
+      "learning_rate": 2.334004587234717e-05,
+      "loss": 0.6333,
+      "step": 981
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.44125548907997186,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.7504,
+      "step": 982
+    },
+    {
+      "epoch": 0.7864,
+      "grad_norm": 0.36862297747718953,
+      "learning_rate": 2.300819024631603e-05,
+      "loss": 0.6269,
+      "step": 983
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.37319845184821043,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6257,
+      "step": 984
+    },
+    {
+      "epoch": 0.788,
+      "grad_norm": 0.4427215035234154,
+      "learning_rate": 2.26784037992395e-05,
+      "loss": 0.753,
+      "step": 985
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.37144786727059803,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6446,
+      "step": 986
+    },
+    {
+      "epoch": 0.7896,
+      "grad_norm": 0.38097204617512126,
+      "learning_rate": 2.2350695394231345e-05,
+      "loss": 0.6654,
+      "step": 987
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.4225298229068004,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6565,
+      "step": 988
+    },
+    {
+      "epoch": 0.7912,
+      "grad_norm": 0.3691817251407238,
+      "learning_rate": 2.2025073838557454e-05,
+      "loss": 0.6551,
+      "step": 989
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.37136941337706175,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.5964,
+      "step": 990
+    },
+    {
+      "epoch": 0.7928,
+      "grad_norm": 0.37450244628290513,
+      "learning_rate": 2.1701547883398922e-05,
+      "loss": 0.6865,
+      "step": 991
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4130833114697751,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6977,
+      "step": 992
+    },
+    {
+      "epoch": 0.7944,
+      "grad_norm": 0.44555395651971014,
+      "learning_rate": 2.138012622361689e-05,
+      "loss": 0.6377,
+      "step": 993
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.3968898425692245,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6254,
+      "step": 994
+    },
+    {
+      "epoch": 0.796,
+      "grad_norm": 0.3712169948573636,
+      "learning_rate": 2.106081749751897e-05,
+      "loss": 0.6343,
+      "step": 995
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3865098872535555,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6337,
+      "step": 996
+    },
+    {
+      "epoch": 0.7976,
+      "grad_norm": 0.43081481479411904,
+      "learning_rate": 2.0743630286627002e-05,
+      "loss": 0.669,
+      "step": 997
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.33824124335767314,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6073,
+      "step": 998
+    },
+    {
+      "epoch": 0.7992,
+      "grad_norm": 0.3664290646462942,
+      "learning_rate": 2.0428573115446392e-05,
+      "loss": 0.6355,
+      "step": 999
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4116124972227826,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6394,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8008,
+      "grad_norm": 0.4214488547569396,
+      "learning_rate": 2.011565445123711e-05,
+      "loss": 0.6894,
+      "step": 1001
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.35041197043145234,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6128,
+      "step": 1002
+    },
+    {
+      "epoch": 0.8024,
+      "grad_norm": 0.3928012124823455,
+      "learning_rate": 1.980488270378612e-05,
+      "loss": 0.6737,
+      "step": 1003
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.41725282345429426,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.7083,
+      "step": 1004
+    },
+    {
+      "epoch": 0.804,
+      "grad_norm": 0.3615156197633144,
+      "learning_rate": 1.9496266225181248e-05,
+      "loss": 0.639,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.41426420962795724,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.693,
+      "step": 1006
+    },
+    {
+      "epoch": 0.8056,
+      "grad_norm": 0.3683560317023612,
+      "learning_rate": 1.918981330958678e-05,
+      "loss": 0.5815,
+      "step": 1007
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.41568039625658215,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6066,
+      "step": 1008
+    },
+    {
+      "epoch": 0.8072,
+      "grad_norm": 0.4006963112284083,
+      "learning_rate": 1.8885532193020704e-05,
+      "loss": 0.6937,
+      "step": 1009
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.41018249261627265,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.717,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8088,
+      "grad_norm": 0.4995323353787287,
+      "learning_rate": 1.8583431053133127e-05,
+      "loss": 0.7592,
+      "step": 1011
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.43650575521391843,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6515,
+      "step": 1012
+    },
+    {
+      "epoch": 0.8104,
+      "grad_norm": 0.40163729148136174,
+      "learning_rate": 1.8283518008986567e-05,
+      "loss": 0.6575,
+      "step": 1013
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.40080269999384505,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6775,
+      "step": 1014
+    },
+    {
+      "epoch": 0.812,
+      "grad_norm": 0.3940333512523945,
+      "learning_rate": 1.7985801120837865e-05,
+      "loss": 0.6355,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.42995740013190453,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.7174,
+      "step": 1016
+    },
+    {
+      "epoch": 0.8136,
+      "grad_norm": 0.4008589303162475,
+      "learning_rate": 1.7690288389921493e-05,
+      "loss": 0.6797,
+      "step": 1017
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.43877207243888605,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.742,
+      "step": 1018
+    },
+    {
+      "epoch": 0.8152,
+      "grad_norm": 0.4143732694255588,
+      "learning_rate": 1.739698775823442e-05,
+      "loss": 0.6872,
+      "step": 1019
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.362122022640218,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6237,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8168,
+      "grad_norm": 0.401006384524641,
+      "learning_rate": 1.7105907108322816e-05,
+      "loss": 0.6287,
+      "step": 1021
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.41150942890868347,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6576,
+      "step": 1022
+    },
+    {
+      "epoch": 0.8184,
+      "grad_norm": 0.40850542982370197,
+      "learning_rate": 1.6817054263070174e-05,
+      "loss": 0.6039,
+      "step": 1023
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.42762187690430686,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6763,
+      "step": 1024
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.38675675962303874,
+      "learning_rate": 1.6530436985486996e-05,
+      "loss": 0.6441,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4138391754414263,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.7003,
+      "step": 1026
+    },
+    {
+      "epoch": 0.8216,
+      "grad_norm": 0.41573860167128435,
+      "learning_rate": 1.6246062978502164e-05,
+      "loss": 0.6968,
+      "step": 1027
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.42718124101470617,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6816,
+      "step": 1028
+    },
+    {
+      "epoch": 0.8232,
+      "grad_norm": 0.3716399930215204,
+      "learning_rate": 1.5963939884756042e-05,
+      "loss": 0.6321,
+      "step": 1029
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.469897313356421,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.7003,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8248,
+      "grad_norm": 0.4041837674527376,
+      "learning_rate": 1.5684075286394985e-05,
+      "loss": 0.7056,
+      "step": 1031
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3657042641738609,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.5993,
+      "step": 1032
+    },
+    {
+      "epoch": 0.8264,
+      "grad_norm": 0.3993576544320269,
+      "learning_rate": 1.5406476704867524e-05,
+      "loss": 0.6623,
+      "step": 1033
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.35331311245911873,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6126,
+      "step": 1034
+    },
+    {
+      "epoch": 0.828,
+      "grad_norm": 0.3573893675271595,
+      "learning_rate": 1.5131151600722337e-05,
+      "loss": 0.5988,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3709209680359646,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6527,
+      "step": 1036
+    },
+    {
+      "epoch": 0.8296,
+      "grad_norm": 0.41541374035461237,
+      "learning_rate": 1.485810737340767e-05,
+      "loss": 0.6688,
+      "step": 1037
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3764019643545416,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6141,
+      "step": 1038
+    },
+    {
+      "epoch": 0.8312,
+      "grad_norm": 0.4369468983893906,
+      "learning_rate": 1.4587351361072454e-05,
+      "loss": 0.6751,
+      "step": 1039
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.38250970066817025,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6329,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8328,
+      "grad_norm": 0.42672350476593035,
+      "learning_rate": 1.4318890840369182e-05,
+      "loss": 0.6886,
+      "step": 1041
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.37009545183960346,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6003,
+      "step": 1042
+    },
+    {
+      "epoch": 0.8344,
+      "grad_norm": 0.37229548936007656,
+      "learning_rate": 1.4052733026258281e-05,
+      "loss": 0.6348,
+      "step": 1043
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4549849078289334,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.7643,
+      "step": 1044
+    },
+    {
+      "epoch": 0.836,
+      "grad_norm": 0.37206131026446826,
+      "learning_rate": 1.3788885071814172e-05,
+      "loss": 0.6873,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.38122123278652426,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6317,
+      "step": 1046
+    },
+    {
+      "epoch": 0.8376,
+      "grad_norm": 0.42787305571612866,
+      "learning_rate": 1.3527354068033139e-05,
+      "loss": 0.7039,
+      "step": 1047
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.42531717697000854,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.714,
+      "step": 1048
+    },
+    {
+      "epoch": 0.8392,
+      "grad_norm": 0.42190425635710993,
+      "learning_rate": 1.326814704364262e-05,
+      "loss": 0.6994,
+      "step": 1049
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3275707945531675,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.5978,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8408,
+      "grad_norm": 0.4123729826288982,
+      "learning_rate": 1.3011270964912459e-05,
+      "loss": 0.6265,
+      "step": 1051
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.38435160511624555,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6507,
+      "step": 1052
+    },
+    {
+      "epoch": 0.8424,
+      "grad_norm": 0.37267373640836915,
+      "learning_rate": 1.275673273546758e-05,
+      "loss": 0.6474,
+      "step": 1053
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.38302673161912193,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6811,
+      "step": 1054
+    },
+    {
+      "epoch": 0.844,
+      "grad_norm": 0.4091845289678434,
+      "learning_rate": 1.2504539196102439e-05,
+      "loss": 0.7164,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.40155070927607034,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6872,
+      "step": 1056
+    },
+    {
+      "epoch": 0.8456,
+      "grad_norm": 0.4194861227916186,
+      "learning_rate": 1.2254697124597237e-05,
+      "loss": 0.6948,
+      "step": 1057
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.38925930416460935,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6353,
+      "step": 1058
+    },
+    {
+      "epoch": 0.8472,
+      "grad_norm": 0.3343519926674715,
+      "learning_rate": 1.2007213235535786e-05,
+      "loss": 0.5882,
+      "step": 1059
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.37468882377998025,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6788,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8488,
+      "grad_norm": 0.415944774962567,
+      "learning_rate": 1.176209418012495e-05,
+      "loss": 0.6272,
+      "step": 1061
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.4249131063398309,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7145,
+      "step": 1062
+    },
+    {
+      "epoch": 0.8504,
+      "grad_norm": 0.41711218711666387,
+      "learning_rate": 1.1519346546015907e-05,
+      "loss": 0.6209,
+      "step": 1063
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4128837498379851,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.6976,
+      "step": 1064
+    },
+    {
+      "epoch": 0.852,
+      "grad_norm": 0.35925305553677817,
+      "learning_rate": 1.1278976857127311e-05,
+      "loss": 0.665,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.35258149175832587,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6029,
+      "step": 1066
+    },
+    {
+      "epoch": 0.8536,
+      "grad_norm": 0.45046396204959727,
+      "learning_rate": 1.1040991573469629e-05,
+      "loss": 0.693,
+      "step": 1067
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3669770913635565,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6507,
+      "step": 1068
+    },
+    {
+      "epoch": 0.8552,
+      "grad_norm": 0.35453880895924644,
+      "learning_rate": 1.0805397090971737e-05,
+      "loss": 0.6464,
+      "step": 1069
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3461360282849764,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6271,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8568,
+      "grad_norm": 0.3385469110829669,
+      "learning_rate": 1.057219974130903e-05,
+      "loss": 0.6163,
+      "step": 1071
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.37768309956169943,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6733,
+      "step": 1072
+    },
+    {
+      "epoch": 0.8584,
+      "grad_norm": 0.43192174935318534,
+      "learning_rate": 1.0341405791733183e-05,
+      "loss": 0.7395,
+      "step": 1073
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.3896447776958473,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6399,
+      "step": 1074
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.38248077399288255,
+      "learning_rate": 1.0113021444903726e-05,
+      "loss": 0.6836,
+      "step": 1075
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3709794028821994,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6068,
+      "step": 1076
+    },
+    {
+      "epoch": 0.8616,
+      "grad_norm": 0.3904703563728366,
+      "learning_rate": 9.887052838721322e-06,
+      "loss": 0.6232,
+      "step": 1077
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.36606600947344253,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6544,
+      "step": 1078
+    },
+    {
+      "epoch": 0.8632,
+      "grad_norm": 0.43381197547874695,
+      "learning_rate": 9.663506046162985e-06,
+      "loss": 0.7176,
+      "step": 1079
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.39471345240167216,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6605,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8648,
+      "grad_norm": 0.4315478752535828,
+      "learning_rate": 9.44238707511862e-06,
+      "loss": 0.6423,
+      "step": 1081
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.40013872266622436,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6943,
+      "step": 1082
+    },
+    {
+      "epoch": 0.8664,
+      "grad_norm": 0.387519395093711,
+      "learning_rate": 9.22370186822965e-06,
+      "loss": 0.6568,
+      "step": 1083
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.4340248915869381,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.659,
+      "step": 1084
+    },
+    {
+      "epoch": 0.868,
+      "grad_norm": 0.40221129232474506,
+      "learning_rate": 9.0074563027294e-06,
+      "loss": 0.7269,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.40148875037197057,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6207,
+      "step": 1086
+    },
+    {
+      "epoch": 0.8696,
+      "grad_norm": 0.3509492771514673,
+      "learning_rate": 8.79365619028507e-06,
+      "loss": 0.6336,
+      "step": 1087
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.397875287389267,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6849,
+      "step": 1088
+    },
+    {
+      "epoch": 0.8712,
+      "grad_norm": 0.46386665430153035,
+      "learning_rate": 8.582307276841462e-06,
+      "loss": 0.7382,
+      "step": 1089
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.4056797818327132,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6658,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8728,
+      "grad_norm": 0.38602393116264594,
+      "learning_rate": 8.37341524246672e-06,
+      "loss": 0.6476,
+      "step": 1091
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4828539366225798,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.7534,
+      "step": 1092
+    },
+    {
+      "epoch": 0.8744,
+      "grad_norm": 0.37718901184575493,
+      "learning_rate": 8.166985701199582e-06,
+      "loss": 0.6525,
+      "step": 1093
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.3902132590453072,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6359,
+      "step": 1094
+    },
+    {
+      "epoch": 0.876,
+      "grad_norm": 0.4060602940939635,
+      "learning_rate": 7.963024200898462e-06,
+      "loss": 0.6597,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.39823302264436056,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6407,
+      "step": 1096
+    },
+    {
+      "epoch": 0.8776,
+      "grad_norm": 0.3495785862432801,
+      "learning_rate": 7.761536223092458e-06,
+      "loss": 0.5887,
+      "step": 1097
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.37154597449786636,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6655,
+      "step": 1098
+    },
+    {
+      "epoch": 0.8792,
+      "grad_norm": 0.4258319717537172,
+      "learning_rate": 7.562527182833978e-06,
+      "loss": 0.6908,
+      "step": 1099
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4059964137720068,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.6829,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8808,
+      "grad_norm": 0.4651857139614339,
+      "learning_rate": 7.366002428553153e-06,
+      "loss": 0.7124,
+      "step": 1101
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.4002873835289902,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.7299,
+      "step": 1102
+    },
+    {
+      "epoch": 0.8824,
+      "grad_norm": 0.4182887098863189,
+      "learning_rate": 7.171967241914224e-06,
+      "loss": 0.7299,
+      "step": 1103
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3958402531946444,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6843,
+      "step": 1104
+    },
+    {
+      "epoch": 0.884,
+      "grad_norm": 0.42650265679061683,
+      "learning_rate": 6.980426837673437e-06,
+      "loss": 0.6642,
+      "step": 1105
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.36979311141228494,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.724,
+      "step": 1106
+    },
+    {
+      "epoch": 0.8856,
+      "grad_norm": 0.45611193851844184,
+      "learning_rate": 6.791386363539065e-06,
+      "loss": 0.6907,
+      "step": 1107
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.33588568532605667,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.6131,
+      "step": 1108
+    },
+    {
+      "epoch": 0.8872,
+      "grad_norm": 0.4061854018710927,
+      "learning_rate": 6.604850900032955e-06,
+      "loss": 0.7133,
+      "step": 1109
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.38484916040022893,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6908,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8888,
+      "grad_norm": 0.40279673459652093,
+      "learning_rate": 6.420825460353974e-06,
+      "loss": 0.6769,
+      "step": 1111
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.35459493389580776,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6024,
+      "step": 1112
+    },
+    {
+      "epoch": 0.8904,
+      "grad_norm": 0.39348496439416963,
+      "learning_rate": 6.239314990243339e-06,
+      "loss": 0.6886,
+      "step": 1113
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.36160745975341585,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.5982,
+      "step": 1114
+    },
+    {
+      "epoch": 0.892,
+      "grad_norm": 0.4170137122342651,
+      "learning_rate": 6.0603243678516995e-06,
+      "loss": 0.6217,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4247995479288835,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.672,
+      "step": 1116
+    },
+    {
+      "epoch": 0.8936,
+      "grad_norm": 0.40112248826611274,
+      "learning_rate": 5.883858403607967e-06,
+      "loss": 0.6525,
+      "step": 1117
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.3830011434263009,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6717,
+      "step": 1118
+    },
+    {
+      "epoch": 0.8952,
+      "grad_norm": 0.3767053001757346,
+      "learning_rate": 5.7099218400900716e-06,
+      "loss": 0.6285,
+      "step": 1119
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4262070140089433,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.7382,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8968,
+      "grad_norm": 0.42653574026738084,
+      "learning_rate": 5.538519351897575e-06,
+      "loss": 0.5835,
+      "step": 1121
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.3786988857351713,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.5964,
+      "step": 1122
+    },
+    {
+      "epoch": 0.8984,
+      "grad_norm": 0.4136283084118521,
+      "learning_rate": 5.369655545525909e-06,
+      "loss": 0.6574,
+      "step": 1123
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3824126838189303,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6647,
+      "step": 1124
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.41220862608650916,
+      "learning_rate": 5.2033349592426335e-06,
+      "loss": 0.6328,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4120713215748317,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6746,
+      "step": 1126
+    },
+    {
+      "epoch": 0.9016,
+      "grad_norm": 0.3958022854447971,
+      "learning_rate": 5.039562062965508e-06,
+      "loss": 0.6618,
+      "step": 1127
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.40274399574312025,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.6905,
+      "step": 1128
+    },
+    {
+      "epoch": 0.9032,
+      "grad_norm": 0.39693457693884204,
+      "learning_rate": 4.87834125814235e-06,
+      "loss": 0.6835,
+      "step": 1129
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.41969762965770036,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6685,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9048,
+      "grad_norm": 0.3463381295579955,
+      "learning_rate": 4.719676877632639e-06,
+      "loss": 0.6296,
+      "step": 1131
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.37401139243286946,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6244,
+      "step": 1132
+    },
+    {
+      "epoch": 0.9064,
+      "grad_norm": 0.4246369519189044,
+      "learning_rate": 4.563573185591219e-06,
+      "loss": 0.7189,
+      "step": 1133
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.36601039318896783,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6796,
+      "step": 1134
+    },
+    {
+      "epoch": 0.908,
+      "grad_norm": 0.3974605104680374,
+      "learning_rate": 4.4100343773536225e-06,
+      "loss": 0.619,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.4110936231174561,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.7068,
+      "step": 1136
+    },
+    {
+      "epoch": 0.9096,
+      "grad_norm": 0.4177468932497064,
+      "learning_rate": 4.259064579323302e-06,
+      "loss": 0.6845,
+      "step": 1137
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.4264028114861248,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.706,
+      "step": 1138
+    },
+    {
+      "epoch": 0.9112,
+      "grad_norm": 0.40196424023510746,
+      "learning_rate": 4.1106678488607495e-06,
+      "loss": 0.684,
+      "step": 1139
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.38155367513039906,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6821,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9128,
+      "grad_norm": 0.43322231632362984,
+      "learning_rate": 3.964848174174541e-06,
+      "loss": 0.6933,
+      "step": 1141
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.38575462841035746,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6331,
+      "step": 1142
+    },
+    {
+      "epoch": 0.9144,
+      "grad_norm": 0.36988807284998965,
+      "learning_rate": 3.821609474213983e-06,
+      "loss": 0.6754,
+      "step": 1143
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.36894533488174114,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6117,
+      "step": 1144
+    },
+    {
+      "epoch": 0.916,
+      "grad_norm": 0.3824389769632664,
+      "learning_rate": 3.6809555985639068e-06,
+      "loss": 0.5828,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.4100882503431007,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6732,
+      "step": 1146
+    },
+    {
+      "epoch": 0.9176,
+      "grad_norm": 0.3939656464313838,
+      "learning_rate": 3.5428903273411863e-06,
+      "loss": 0.6479,
+      "step": 1147
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3610380946052676,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6151,
+      "step": 1148
+    },
+    {
+      "epoch": 0.9192,
+      "grad_norm": 0.37176805505393795,
+      "learning_rate": 3.40741737109318e-06,
+      "loss": 0.6099,
+      "step": 1149
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3726635307220839,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6585,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9208,
+      "grad_norm": 0.36404188224864553,
+      "learning_rate": 3.2745403706978872e-06,
+      "loss": 0.6664,
+      "step": 1151
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3785339085319431,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6647,
+      "step": 1152
+    },
+    {
+      "epoch": 0.9224,
+      "grad_norm": 0.4185980915310844,
+      "learning_rate": 3.1442628972662704e-06,
+      "loss": 0.6689,
+      "step": 1153
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.4099574635267387,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6856,
+      "step": 1154
+    },
+    {
+      "epoch": 0.924,
+      "grad_norm": 0.3818024637485815,
+      "learning_rate": 3.0165884520461316e-06,
+      "loss": 0.6691,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.36279268301084633,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.642,
+      "step": 1156
+    },
+    {
+      "epoch": 0.9256,
+      "grad_norm": 0.38331837755282544,
+      "learning_rate": 2.8915204663281013e-06,
+      "loss": 0.6509,
+      "step": 1157
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.369111445716187,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6361,
+      "step": 1158
+    },
+    {
+      "epoch": 0.9272,
+      "grad_norm": 0.4115678486699466,
+      "learning_rate": 2.7690623013533976e-06,
+      "loss": 0.6855,
+      "step": 1159
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.39808971107404223,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7295,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9288,
+      "grad_norm": 0.46666506769343125,
+      "learning_rate": 2.649217248223468e-06,
+      "loss": 0.7466,
+      "step": 1161
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.42147508706907266,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6955,
+      "step": 1162
+    },
+    {
+      "epoch": 0.9304,
+      "grad_norm": 0.41894688199289515,
+      "learning_rate": 2.5319885278115906e-06,
+      "loss": 0.7147,
+      "step": 1163
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.45675887190903264,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.7232,
+      "step": 1164
+    },
+    {
+      "epoch": 0.932,
+      "grad_norm": 0.4522821555018652,
+      "learning_rate": 2.4173792906762804e-06,
+      "loss": 0.7097,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.37444659689206355,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6347,
+      "step": 1166
+    },
+    {
+      "epoch": 0.9336,
+      "grad_norm": 0.42113978996222756,
+      "learning_rate": 2.3053926169765984e-06,
+      "loss": 0.7053,
+      "step": 1167
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.39523331936067674,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6669,
+      "step": 1168
+    },
+    {
+      "epoch": 0.9352,
+      "grad_norm": 0.4137522101946141,
+      "learning_rate": 2.1960315163894075e-06,
+      "loss": 0.6901,
+      "step": 1169
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.4325293674351783,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6585,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9368,
+      "grad_norm": 0.42502219933304314,
+      "learning_rate": 2.0892989280284823e-06,
+      "loss": 0.6946,
+      "step": 1171
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.4150688302051163,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6896,
+      "step": 1172
+    },
+    {
+      "epoch": 0.9384,
+      "grad_norm": 0.3872164353059273,
+      "learning_rate": 1.9851977203654835e-06,
+      "loss": 0.6084,
+      "step": 1173
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.3972240093463755,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7117,
+      "step": 1174
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.41689310545481006,
+      "learning_rate": 1.8837306911529184e-06,
+      "loss": 0.7383,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.3442155424450381,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6012,
+      "step": 1176
+    },
+    {
+      "epoch": 0.9416,
+      "grad_norm": 0.38713774050685595,
+      "learning_rate": 1.7849005673489127e-06,
+      "loss": 0.6523,
+      "step": 1177
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3944181865000051,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6382,
+      "step": 1178
+    },
+    {
+      "epoch": 0.9432,
+      "grad_norm": 0.38923644150853126,
+      "learning_rate": 1.6887100050439587e-06,
+      "loss": 0.7153,
+      "step": 1179
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3851152231100486,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6433,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9448,
+      "grad_norm": 0.4070747814345844,
+      "learning_rate": 1.595161589389449e-06,
+      "loss": 0.6767,
+      "step": 1181
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.3907538104319003,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6534,
+      "step": 1182
+    },
+    {
+      "epoch": 0.9464,
+      "grad_norm": 0.37924046239724357,
+      "learning_rate": 1.5042578345283108e-06,
+      "loss": 0.6908,
+      "step": 1183
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3913428678490897,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.662,
+      "step": 1184
+    },
+    {
+      "epoch": 0.948,
+      "grad_norm": 0.39101169097306065,
+      "learning_rate": 1.4160011835273934e-06,
+      "loss": 0.7022,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.39544336411685543,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7007,
+      "step": 1186
+    },
+    {
+      "epoch": 0.9496,
+      "grad_norm": 0.4367205135860217,
+      "learning_rate": 1.3303940083117527e-06,
+      "loss": 0.7405,
+      "step": 1187
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.4145368019549925,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.7002,
+      "step": 1188
+    },
+    {
+      "epoch": 0.9512,
+      "grad_norm": 0.35362366608330437,
+      "learning_rate": 1.2474386096010039e-06,
+      "loss": 0.6433,
+      "step": 1189
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.40739311742148976,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6905,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9528,
+      "grad_norm": 0.3450490486393698,
+      "learning_rate": 1.1671372168474138e-06,
+      "loss": 0.6343,
+      "step": 1191
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.36672693593050265,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6555,
+      "step": 1192
+    },
+    {
+      "epoch": 0.9544,
+      "grad_norm": 0.40443821669536556,
+      "learning_rate": 1.089491988176017e-06,
+      "loss": 0.7021,
+      "step": 1193
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.36152910649068787,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6078,
+      "step": 1194
+    },
+    {
+      "epoch": 0.956,
+      "grad_norm": 0.38822081969756744,
+      "learning_rate": 1.014505010326583e-06,
+      "loss": 0.5962,
+      "step": 1195
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4021822336397349,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.6385,
+      "step": 1196
+    },
+    {
+      "epoch": 0.9576,
+      "grad_norm": 0.36888296373018903,
+      "learning_rate": 9.421782985976068e-07,
+      "loss": 0.6739,
+      "step": 1197
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.36602497625700325,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.6636,
+      "step": 1198
+    },
+    {
+      "epoch": 0.9592,
+      "grad_norm": 0.37834634024893005,
+      "learning_rate": 8.725137967920738e-07,
+      "loss": 0.6394,
+      "step": 1199
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.41056503998473765,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.7028,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9608,
+      "grad_norm": 0.41934071567059505,
+      "learning_rate": 8.055133771652345e-07,
+      "loss": 0.6132,
+      "step": 1201
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3895531059623432,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6956,
+      "step": 1202
+    },
+    {
+      "epoch": 0.9624,
+      "grad_norm": 0.4123458423850561,
+      "learning_rate": 7.411788403743237e-07,
+      "loss": 0.658,
+      "step": 1203
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.36695705362492903,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6275,
+      "step": 1204
+    },
+    {
+      "epoch": 0.964,
+      "grad_norm": 0.37582300821639975,
+      "learning_rate": 6.7951191543012e-07,
+      "loss": 0.5752,
+      "step": 1205
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.36945052731923683,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7127,
+      "step": 1206
+    },
+    {
+      "epoch": 0.9656,
+      "grad_norm": 0.41868711436006467,
+      "learning_rate": 6.205142596505176e-07,
+      "loss": 0.6849,
+      "step": 1207
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.43543255942755765,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6454,
+      "step": 1208
+    },
+    {
+      "epoch": 0.9672,
+      "grad_norm": 0.37590235442187986,
+      "learning_rate": 5.64187458615939e-07,
+      "loss": 0.6789,
+      "step": 1209
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.41731184653345194,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6704,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9688,
+      "grad_norm": 0.4143146083508496,
+      "learning_rate": 5.105330261267916e-07,
+      "loss": 0.6373,
+      "step": 1211
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.3851611980362892,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6019,
+      "step": 1212
+    },
+    {
+      "epoch": 0.9704,
+      "grad_norm": 0.40780593086081135,
+      "learning_rate": 4.5955240416271084e-07,
+      "loss": 0.6518,
+      "step": 1213
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4056988218108442,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.7135,
+      "step": 1214
+    },
+    {
+      "epoch": 0.972,
+      "grad_norm": 0.3784078966028107,
+      "learning_rate": 4.112469628438365e-07,
+      "loss": 0.6387,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3637434465211071,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6167,
+      "step": 1216
+    },
+    {
+      "epoch": 0.9736,
+      "grad_norm": 0.3616300978607632,
+      "learning_rate": 3.6561800039403016e-07,
+      "loss": 0.6263,
+      "step": 1217
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.4129713945666034,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6191,
+      "step": 1218
+    },
+    {
+      "epoch": 0.9752,
+      "grad_norm": 0.35762528465376253,
+      "learning_rate": 3.2266674310589273e-07,
+      "loss": 0.6434,
+      "step": 1219
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.36828474797230004,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6546,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9768,
+      "grad_norm": 0.3957523047687968,
+      "learning_rate": 2.8239434530792365e-07,
+      "loss": 0.6515,
+      "step": 1221
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.39174572878750363,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6431,
+      "step": 1222
+    },
+    {
+      "epoch": 0.9784,
+      "grad_norm": 0.3779064892067319,
+      "learning_rate": 2.448018893333681e-07,
+      "loss": 0.647,
+      "step": 1223
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.39824669482756286,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6405,
+      "step": 1224
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.37074336916815626,
+      "learning_rate": 2.098903854912515e-07,
+      "loss": 0.633,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.34004430054490165,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6539,
+      "step": 1226
+    },
+    {
+      "epoch": 0.9816,
+      "grad_norm": 0.37166414703450706,
+      "learning_rate": 1.7766077203915655e-07,
+      "loss": 0.5827,
+      "step": 1227
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3917322050265033,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.651,
+      "step": 1228
+    },
+    {
+      "epoch": 0.9832,
+      "grad_norm": 0.3788898361910757,
+      "learning_rate": 1.481139151579991e-07,
+      "loss": 0.7005,
+      "step": 1229
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.32905755632467365,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.5832,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9848,
+      "grad_norm": 0.4212142889676699,
+      "learning_rate": 1.2125060892881346e-07,
+      "loss": 0.7139,
+      "step": 1231
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3506529790455762,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6374,
+      "step": 1232
+    },
+    {
+      "epoch": 0.9864,
+      "grad_norm": 0.40496802534144327,
+      "learning_rate": 9.707157531134713e-08,
+      "loss": 0.6205,
+      "step": 1233
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.38956207080771643,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.7307,
+      "step": 1234
+    },
+    {
+      "epoch": 0.988,
+      "grad_norm": 0.409276659437291,
+      "learning_rate": 7.557746412468758e-08,
+      "loss": 0.6849,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.33857753778177363,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6282,
+      "step": 1236
+    },
+    {
+      "epoch": 0.9896,
+      "grad_norm": 0.35583207592596244,
+      "learning_rate": 5.6768853029787184e-08,
+      "loss": 0.6375,
+      "step": 1237
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.4278669236748669,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6559,
+      "step": 1238
+    },
+    {
+      "epoch": 0.9912,
+      "grad_norm": 0.35771404196607504,
+      "learning_rate": 4.064624751394242e-08,
+      "loss": 0.5723,
+      "step": 1239
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3705260496346748,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6327,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9928,
+      "grad_norm": 0.40411456405464913,
+      "learning_rate": 2.7210080877237976e-08,
+      "loss": 0.6728,
+      "step": 1241
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.39034217032554636,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.653,
+      "step": 1242
+    },
+    {
+      "epoch": 0.9944,
+      "grad_norm": 0.3684099601229621,
+      "learning_rate": 1.646071422083395e-08,
+      "loss": 0.6704,
+      "step": 1243
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4601256797139572,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.7698,
+      "step": 1244
+    },
+    {
+      "epoch": 0.996,
+      "grad_norm": 0.5029579074824774,
+      "learning_rate": 8.398436437317969e-09,
+      "loss": 0.7173,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.4367252519126284,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.7101,
+      "step": 1246
+    },
+    {
+      "epoch": 0.9976,
+      "grad_norm": 0.4054879695873238,
+      "learning_rate": 3.023464202944748e-09,
+      "loss": 0.7071,
+      "step": 1247
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3916101744254669,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6545,
+      "step": 1248
+    },
+    {
+      "epoch": 0.9992,
+      "grad_norm": 0.455720607139133,
+      "learning_rate": 3.3594197175190745e-10,
+      "loss": 0.7416,
+      "step": 1249
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3877743051460032,
+      "learning_rate": 0.0,
+      "loss": 0.6556,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0,
+      "step": 1250,
+      "total_flos": 1118125740523520.0,
+      "train_loss": 0.7323009983539581,
+      "train_runtime": 19639.1569,
+      "train_samples_per_second": 1.018,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1118125740523520.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a6863c943bd46bcce560929356f6b56d66f111f7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..436c9519a878d70d21e16a874d25a7d4a73d59bf
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:887965b3be290a805860a3539a31f509131a71329e1a3fa3ec063f825b1916d7
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..95503a5f7e5192f4eb7000a3d6c7db23eda2ca16
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:132b2359c1af99b84c77dc4998a784e8c1c34f2ebf05bc9b2e9d8c9c86964125
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..97a0479a48d8d53e59fdb7fdce339ddd8974f0a5
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8230102459917927,
+      "learning_rate": 5e-05,
+      "loss": 1.2514,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7754613104842406,
+      "learning_rate": 0.0001,
+      "loss": 1.2817,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.7476538150253536,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.3323,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.826743930439023,
+      "learning_rate": 0.0002,
+      "loss": 1.1092,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.8815455745050119,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 0.9763,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6994265783297446,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 0.9622,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4844382211969728,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.84,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4831927243517946,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.8755,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5020615808420609,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.8759,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4491917358911475,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.7923,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5621541379413584,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.8595,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5229399877952087,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.9081,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4899483245891199,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.8514,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5248054422478008,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.8841,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4467654958381241,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.8905,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.485282859570833,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.9181,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.39458183468054975,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.7556,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4373416456058985,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.8343,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.5027527747723598,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.9187,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4098235261127106,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.8822,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4295129059527011,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.8656,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3936690434586878,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.8036,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.4417471186799036,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8928,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4864785406175006,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.9437,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.45968721046185873,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.8372,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4830714665889916,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.8806,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.43737069265822076,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.8653,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.49014680189737875,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.9067,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.3889008209366986,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.7482,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.46759884804944396,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.8731,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.44882353456669494,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.8668,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.48171393222739595,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.8581,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3903285974086322,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.7926,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.40684507206766596,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.8027,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4994530648174004,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8932,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.44581192002150094,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.9286,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.5045891023767098,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.7539,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.6705640902608652,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.7617,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.42724800016082565,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.7821,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4693831019747195,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.8945,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.4177089941624564,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.7994,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4545228930867604,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.8277,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.40871977385442615,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.8496,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3950360210436592,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.762,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.48892837124416993,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.9717,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.40637100014340133,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.7747,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.419754658262873,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.8454,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.43116125499781977,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.8547,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.45465691712353407,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.8865,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.43305078198789515,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.7896,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.418300240118205,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.7947,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4063728927765339,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.7833,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.434297863169188,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.7386,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.3595960970194214,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.7493,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.40644583570820914,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.7941,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.45001523527084303,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.8319,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4453989829060359,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.8097,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.5209708873974737,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.8557,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.36837003736864576,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.8406,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.42846222791516786,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.8763,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.42477424040871764,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.8633,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.37810519177436647,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.7503,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.7574479348860013,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.7541,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4593511499288001,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.8625,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.38782464238758474,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7631,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.376653667689996,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.7488,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.43937504286831613,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.7863,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.41651678253242863,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.8357,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.4016459673662941,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.7872,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5064989487514656,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.8308,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.40645253693630223,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.7172,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4730501998349042,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.9324,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.37087713139709355,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.768,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3964332480626391,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.8189,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.7221770560206417,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.8414,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4316521393843248,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.771,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.4084063585831575,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.814,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3942071420392019,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.8124,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.44230839486402507,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.9178,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3996441014502741,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.8134,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.43362126658735817,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7898,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4031592034738205,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.8033,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.366707674078855,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.7775,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3975638056137053,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.8359,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.4045827373258958,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.8101,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4093349338659919,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.8367,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.36895176230401944,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.7753,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.41132089972959834,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.7604,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.350552813073951,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.7579,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.36441120795630105,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.8193,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4088620411864537,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.8123,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4482828532364913,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.8331,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.39655166223757243,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.8113,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.39906728803227215,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.8395,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.39806188010556676,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.8374,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3392116012955209,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.7256,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3846613142134544,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.8255,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.40662812774870566,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7375,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.432119270511682,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.8773,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.44158999212591304,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.8638,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.37440088499613716,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.7741,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3866735548327976,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.8221,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.44310425673846804,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.7853,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.34743929892057196,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.6733,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.4703681600961128,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.9532,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.4348078777013694,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.7169,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4145488466937236,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.8356,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.36617204461285024,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.7639,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.40029865160158434,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.7791,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.39878953165088726,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.8354,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.3921062999652053,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.7724,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4499808155505365,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.8353,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4388588621281375,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.8147,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.40645242005253257,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.7926,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.35812624610283694,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.7639,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4594340329784737,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.755,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.39452619162037694,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.7649,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4025940361974267,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.7408,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.3624064832750243,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.7472,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.404072865815922,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.8834,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.39761468715465376,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.7179,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.39029133459924825,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.7815,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.36841909487954966,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7708,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3834250542279903,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7298,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4079464579421249,
+      "learning_rate": 0.0,
+      "loss": 0.7584,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 111708340486144.0,
+      "train_loss": 0.8341695852279664,
+      "train_runtime": 1961.2271,
+      "train_samples_per_second": 1.02,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 111708340486144.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..95e637174f4c6854bec1305e143fcc97e480d899
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "k_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5dd2f56736a672d20e246768cd69181da7d17193
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:452c66bf0be31c51efdfe47a7179b943f633956899ffc5994464dd7ea7bcb263
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fc98fb6e60ae042032ed65563e1e4f42fb4c194d
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb4b39c3d0a288f2bfd5dafe524367a1fa916b14f42b8df2a5c55730e7cdaca6
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..64e5b5901f4ababbbe0da1b506cd9e2a5acb37b5
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.7988098439667438,
+      "learning_rate": 5e-05,
+      "loss": 1.2723,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.8047104939359582,
+      "learning_rate": 0.0001,
+      "loss": 1.3382,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.678779971008081,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.2853,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.9622509769111144,
+      "learning_rate": 0.0002,
+      "loss": 1.1426,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.9449202770205999,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 1.0317,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6746967486634825,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 0.953,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.6007273349994086,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 1.0059,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.42193348083346716,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.8801,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5153618836742115,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.973,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.48297466520195975,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.8564,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.47564449264950726,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.8737,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5207808832264728,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.9143,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4538323052406747,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.7725,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5352076138143766,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.8691,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.45615441780790505,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.9811,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.42359513948647015,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.8762,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.5052044531432168,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.9138,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4748034877827455,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.8831,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.4319246964375862,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.894,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4295723902379643,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.8507,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.43225796519679444,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.8182,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.40490037014979546,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.7561,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.46668780783282765,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8036,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5157456248655427,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.8313,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5128106845064673,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.9407,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.622506863485958,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.9093,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.48229322437865646,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.9261,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4880628956946326,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.8975,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.38460486088342766,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.7645,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4550248150103312,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.8774,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.384620264631093,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.8061,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5225658909213516,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.8908,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4036288051125909,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.8343,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.42633925388376787,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.8138,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.46806271680078293,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8776,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.422612239326831,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.8444,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4388054628355334,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.8609,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.45385481596187843,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.7595,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.43686390157927096,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.8916,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4081204005411219,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.8302,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5653417246912735,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.908,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.39165937516782645,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.7948,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.42135665615054524,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.7923,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.39143282192422507,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.8057,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.44759789583787546,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.954,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.43133884534448624,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.8549,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4415959165799372,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.8775,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4514681186674009,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.8194,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4173767895604976,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.8604,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4133769218943885,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.7908,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.41828754301799737,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.8546,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.406320541980502,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.7896,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.41489293486425927,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.7973,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4677216400238403,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.8564,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.43476535746613176,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.8244,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4277565054201037,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.7799,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.4389903142149982,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.8338,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.41672364775171566,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.8323,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4205986930643128,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.8028,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4213889474829399,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.8655,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.38890226375386644,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.7673,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4007064239374529,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.7581,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.40113808394171746,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.8179,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4701690819238131,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.8274,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3782865063416312,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7886,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.3695210524444637,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.773,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.5672358594074709,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.8099,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4444450552640662,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.8306,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.41008705046493193,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.8789,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4673737229398386,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.9351,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.3830397811715073,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.7336,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4456941609698785,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.8714,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.40560461532518255,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.8273,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.37433262348405216,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.8253,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5367018947959953,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.9215,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.405723420826382,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.7731,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.42171087090867204,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.7812,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.36691854390246237,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.7964,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.5249708367020518,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.9663,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.40363141580240425,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.7888,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.40789371199957575,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.7985,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3948410406597383,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.7705,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.4223952800047747,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.826,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3972599652479266,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.7778,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3995654426311221,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.8366,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.40162821542256544,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.8117,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.39073027474543864,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.8034,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.43886821003145177,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.8382,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3269490532820274,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.6712,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3968517644888277,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.7962,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4341010529888357,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.7879,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.42299890452348515,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.8825,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.39337741550250666,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.8028,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.39484171624919984,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.8119,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.40769681915213646,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.8004,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3572914078439355,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.6857,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.3957369927951531,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.7213,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.37046745911855067,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7468,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.5005726568036697,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.7871,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3983543612467786,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.8407,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.48341697939974004,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.8513,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3992372067430147,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.7209,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.41004749101059196,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.7662,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.370929509581193,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.7471,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.46262922759234665,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.8762,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3589231369999426,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.7501,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.41915549604282365,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.7917,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3706848594244108,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.7624,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.3984204104574697,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.8092,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.6132336734307566,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.8505,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.4104622166667177,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.7983,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3789263447274554,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.7745,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3968337298933055,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.7523,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.42083234830591265,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.7649,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3605611194789269,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.7152,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.36646932666512877,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.715,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.39664518944410565,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.7701,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4198410480684532,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.8049,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.39925709056908043,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.7778,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4297078774434034,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.8153,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.33826156234226706,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.716,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.40010593332292316,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.8225,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.362422594612195,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7373,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.39019640539389827,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7307,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.4127359643945258,
+      "learning_rate": 0.0,
+      "loss": 0.7566,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 113027115089920.0,
+      "train_loss": 0.8387285661697388,
+      "train_runtime": 1973.4661,
+      "train_samples_per_second": 1.013,
+      "train_steps_per_second": 0.063
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 113027115089920.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7b466a38ac92462575b1e20c7147b9b4c32e8e7d
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bd60eae6dec5fe85b046b0be52c1edcf7d2925aa
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b2b1a45abe87793a81bcd2f5ef3b7ef69557f7cec3b632c7ca42150a716cfbf
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..32826a42ba27bc43e432f320a699d2e98d34b7e9
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bbeac56070177880cd170be34cdf7e19898867aecc03254b9931644a626b2ee
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..82faf447591a741394bf544bb4c48ee60d597aa6
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,917 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.8142887741739527,
+      "learning_rate": 5e-05,
+      "loss": 1.3203,
+      "step": 1
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.826263044163555,
+      "learning_rate": 0.0001,
+      "loss": 1.357,
+      "step": 2
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.664560860527008,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.3177,
+      "step": 3
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.8465688159037528,
+      "learning_rate": 0.0002,
+      "loss": 1.1138,
+      "step": 4
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.7328564816107398,
+      "learning_rate": 0.00019996629653035126,
+      "loss": 0.9531,
+      "step": 5
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7074016008967685,
+      "learning_rate": 0.00019986520883988232,
+      "loss": 1.0562,
+      "step": 6
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.538099365465927,
+      "learning_rate": 0.00019969680506871137,
+      "loss": 0.9808,
+      "step": 7
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.5176555636039095,
+      "learning_rate": 0.00019946119873266613,
+      "loss": 0.9657,
+      "step": 8
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.5115786009392326,
+      "learning_rate": 0.00019915854864676664,
+      "loss": 0.9823,
+      "step": 9
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.49782091794630234,
+      "learning_rate": 0.00019878905881817252,
+      "loss": 0.9706,
+      "step": 10
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.5285171786772587,
+      "learning_rate": 0.00019835297830866826,
+      "loss": 0.9181,
+      "step": 11
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5045275115012817,
+      "learning_rate": 0.00019785060106677818,
+      "loss": 0.8479,
+      "step": 12
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.45816614229029784,
+      "learning_rate": 0.00019728226572962473,
+      "loss": 0.8748,
+      "step": 13
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5694700429666423,
+      "learning_rate": 0.0001966483553946637,
+      "loss": 0.9133,
+      "step": 14
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.49181239939969823,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 0.9074,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.49316688543215115,
+      "learning_rate": 0.00019518556284360696,
+      "loss": 0.9717,
+      "step": 16
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.44110871094005183,
+      "learning_rate": 0.0001943576666511982,
+      "loss": 0.8412,
+      "step": 17
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4418895277810478,
+      "learning_rate": 0.0001934661668437073,
+      "loss": 0.8382,
+      "step": 18
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.42911875816940365,
+      "learning_rate": 0.0001925116643538684,
+      "loss": 0.8697,
+      "step": 19
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.44258681052455573,
+      "learning_rate": 0.00019149480258259533,
+      "loss": 0.8645,
+      "step": 20
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.39434812507866013,
+      "learning_rate": 0.00019041626696528503,
+      "loss": 0.8074,
+      "step": 21
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.38627121086070065,
+      "learning_rate": 0.0001892767845097864,
+      "loss": 0.8043,
+      "step": 22
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.42792218348914973,
+      "learning_rate": 0.00018807712330634642,
+      "loss": 0.8948,
+      "step": 23
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.46226754180037904,
+      "learning_rate": 0.0001868180920098644,
+      "loss": 0.8565,
+      "step": 24
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5219612120339278,
+      "learning_rate": 0.00018550053929480202,
+      "loss": 0.9146,
+      "step": 25
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.51761892515566,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 0.9529,
+      "step": 26
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.44024527482066445,
+      "learning_rate": 0.0001826934609456129,
+      "loss": 0.8762,
+      "step": 27
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.49535408012521337,
+      "learning_rate": 0.00018120582747708502,
+      "loss": 0.8785,
+      "step": 28
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.3771750038472647,
+      "learning_rate": 0.0001796634556457236,
+      "loss": 0.774,
+      "step": 29
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.45040072849609114,
+      "learning_rate": 0.0001780673851171728,
+      "loss": 0.8503,
+      "step": 30
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.36341393297332764,
+      "learning_rate": 0.00017641869175372493,
+      "loss": 0.7418,
+      "step": 31
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4216230059515405,
+      "learning_rate": 0.00017471848688911464,
+      "loss": 0.8598,
+      "step": 32
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.4128495288664931,
+      "learning_rate": 0.000172967916579403,
+      "loss": 0.8267,
+      "step": 33
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3861751233140676,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 0.8195,
+      "step": 34
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.40977429983509844,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 0.8812,
+      "step": 35
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4482435012846371,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 0.8215,
+      "step": 36
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4083717573154022,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 0.742,
+      "step": 37
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.38613912307658893,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 0.7913,
+      "step": 38
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.3878615972435959,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 0.7844,
+      "step": 39
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.42355041709215796,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 0.8898,
+      "step": 40
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.43551615637754165,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 0.8392,
+      "step": 41
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.47496150335072046,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 0.7912,
+      "step": 42
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.388663124549581,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 0.7692,
+      "step": 43
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4296499380531653,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 0.7978,
+      "step": 44
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4649418059917025,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 0.8377,
+      "step": 45
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4241895774181597,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 0.7789,
+      "step": 46
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.4375090382766959,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 0.8476,
+      "step": 47
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.41651911410568904,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 0.7997,
+      "step": 48
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.8385320653576406,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 0.8468,
+      "step": 49
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4300807980092393,
+      "learning_rate": 0.000136764169663272,
+      "loss": 0.7734,
+      "step": 50
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4203570601266618,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 0.839,
+      "step": 51
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.39408737233710106,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 0.7536,
+      "step": 52
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4186307599472403,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 0.8182,
+      "step": 53
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.40651416697280657,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 0.8318,
+      "step": 54
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.42229434810654476,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 0.8055,
+      "step": 55
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4358531581187814,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 0.8357,
+      "step": 56
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.40822046618978647,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 0.8125,
+      "step": 57
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4235880476040138,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 0.8259,
+      "step": 58
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4215673331393117,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 0.7784,
+      "step": 59
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4332060167133324,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 0.8417,
+      "step": 60
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.42219695345437325,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 0.8155,
+      "step": 61
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4668267962773362,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 0.7763,
+      "step": 62
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.46272012154033837,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 0.729,
+      "step": 63
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4100359662106381,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 0.8168,
+      "step": 64
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3707040323194465,
+      "learning_rate": 9.870185768020693e-05,
+      "loss": 0.7297,
+      "step": 65
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.40864796522416713,
+      "learning_rate": 9.610644807862625e-05,
+      "loss": 0.8137,
+      "step": 66
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.3802310662705543,
+      "learning_rate": 9.35136630012257e-05,
+      "loss": 0.7679,
+      "step": 67
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4023556002029965,
+      "learning_rate": 9.092525016506858e-05,
+      "loss": 0.7852,
+      "step": 68
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.3809774110383497,
+      "learning_rate": 8.83429543400241e-05,
+      "loss": 0.7833,
+      "step": 69
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.45986536929396077,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 0.8506,
+      "step": 70
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.41836206851235846,
+      "learning_rate": 8.320367101298351e-05,
+      "loss": 0.7764,
+      "step": 71
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4933876256783,
+      "learning_rate": 8.065014774458003e-05,
+      "loss": 0.9205,
+      "step": 72
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.37665806383923467,
+      "learning_rate": 7.810966761934053e-05,
+      "loss": 0.8073,
+      "step": 73
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.4036540744178778,
+      "learning_rate": 7.558394309716088e-05,
+      "loss": 0.8794,
+      "step": 74
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.4697773652413909,
+      "learning_rate": 7.307467669163655e-05,
+      "loss": 0.9005,
+      "step": 75
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.41368973764806866,
+      "learning_rate": 7.058355982245037e-05,
+      "loss": 0.7326,
+      "step": 76
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.43645348493034947,
+      "learning_rate": 6.811227167523815e-05,
+      "loss": 0.8351,
+      "step": 77
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.35867972751751565,
+      "learning_rate": 6.566247806970119e-05,
+      "loss": 0.771,
+      "step": 78
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.44441133948036937,
+      "learning_rate": 6.323583033672799e-05,
+      "loss": 0.8379,
+      "step": 79
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.38343098317284874,
+      "learning_rate": 6.083396420528298e-05,
+      "loss": 0.8411,
+      "step": 80
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.35211736255650594,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 0.725,
+      "step": 81
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4100351022946677,
+      "learning_rate": 5.611103504890444e-05,
+      "loss": 0.8126,
+      "step": 82
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.38466666766475,
+      "learning_rate": 5.379315560596038e-05,
+      "loss": 0.791,
+      "step": 83
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3900399017720507,
+      "learning_rate": 5.1506422782568345e-05,
+      "loss": 0.7693,
+      "step": 84
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.40836104144601987,
+      "learning_rate": 4.9252377995334444e-05,
+      "loss": 0.7799,
+      "step": 85
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.40113863945212963,
+      "learning_rate": 4.703254062686017e-05,
+      "loss": 0.8244,
+      "step": 86
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.44475610746699673,
+      "learning_rate": 4.484840700157295e-05,
+      "loss": 0.774,
+      "step": 87
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.42967156563682585,
+      "learning_rate": 4.270144937709981e-05,
+      "loss": 0.8647,
+      "step": 88
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3496551633293636,
+      "learning_rate": 4.059311495186338e-05,
+      "loss": 0.7044,
+      "step": 89
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.44015822119884673,
+      "learning_rate": 3.852482488956992e-05,
+      "loss": 0.8245,
+      "step": 90
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3942139634871655,
+      "learning_rate": 3.649797336124615e-05,
+      "loss": 0.8019,
+      "step": 91
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.45959491694921717,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 0.9091,
+      "step": 92
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4022454352542581,
+      "learning_rate": 3.257402200743821e-05,
+      "loss": 0.795,
+      "step": 93
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.447424677957686,
+      "learning_rate": 3.0679567197461134e-05,
+      "loss": 0.8613,
+      "step": 94
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.8228159028498044,
+      "learning_rate": 2.8831839169543996e-05,
+      "loss": 0.78,
+      "step": 95
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.40354773729170573,
+      "learning_rate": 2.7032083420597e-05,
+      "loss": 0.8328,
+      "step": 96
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 1.2108866556094844,
+      "learning_rate": 2.528151311088537e-05,
+      "loss": 0.7878,
+      "step": 97
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3610193731209496,
+      "learning_rate": 2.3581308246275103e-05,
+      "loss": 0.7414,
+      "step": 98
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4567058715426913,
+      "learning_rate": 2.1932614882827197e-05,
+      "loss": 0.8132,
+      "step": 99
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5064357508113733,
+      "learning_rate": 2.03365443542764e-05,
+      "loss": 0.8063,
+      "step": 100
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3638760992762728,
+      "learning_rate": 1.879417252291502e-05,
+      "loss": 0.7419,
+      "step": 101
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.40459816038129287,
+      "learning_rate": 1.730653905438714e-05,
+      "loss": 0.7848,
+      "step": 102
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.4336878568274014,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.8357,
+      "step": 103
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.37345618595382063,
+      "learning_rate": 1.4499460705197998e-05,
+      "loss": 0.7668,
+      "step": 104
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.49049743228491344,
+      "learning_rate": 1.3181907990135622e-05,
+      "loss": 0.9038,
+      "step": 105
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3759596761592932,
+      "learning_rate": 1.1922876693653585e-05,
+      "loss": 0.6944,
+      "step": 106
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.4116623409989959,
+      "learning_rate": 1.0723215490213634e-05,
+      "loss": 0.7785,
+      "step": 107
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5133833408468815,
+      "learning_rate": 9.583733034714981e-06,
+      "loss": 0.7874,
+      "step": 108
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.39965353210976334,
+      "learning_rate": 8.505197417404687e-06,
+      "loss": 0.7695,
+      "step": 109
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.43476499922924694,
+      "learning_rate": 7.488335646131628e-06,
+      "loss": 0.8003,
+      "step": 110
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.363404507736322,
+      "learning_rate": 6.533833156292679e-06,
+      "loss": 0.7537,
+      "step": 111
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.38207424823187824,
+      "learning_rate": 5.6423333488018095e-06,
+      "loss": 0.7604,
+      "step": 112
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.40645940426463006,
+      "learning_rate": 4.8144371563930476e-06,
+      "loss": 0.8689,
+      "step": 113
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.38990802657425977,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 0.8056,
+      "step": 114
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3606791147269738,
+      "learning_rate": 3.3516446053363015e-06,
+      "loss": 0.7541,
+      "step": 115
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3757264538044718,
+      "learning_rate": 2.717734270375272e-06,
+      "loss": 0.7306,
+      "step": 116
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.371466212528868,
+      "learning_rate": 2.1493989332218468e-06,
+      "loss": 0.7826,
+      "step": 117
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3687310415085809,
+      "learning_rate": 1.6470216913317626e-06,
+      "loss": 0.7713,
+      "step": 118
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4569668048302852,
+      "learning_rate": 1.2109411818274852e-06,
+      "loss": 0.8099,
+      "step": 119
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4528811913299572,
+      "learning_rate": 8.41451353233369e-07,
+      "loss": 0.8434,
+      "step": 120
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.3577966107509443,
+      "learning_rate": 5.388012673338661e-07,
+      "loss": 0.7936,
+      "step": 121
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4989489821260528,
+      "learning_rate": 3.0319493128866396e-07,
+      "loss": 0.8581,
+      "step": 122
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.38398524622969127,
+      "learning_rate": 1.3479116011769767e-07,
+      "loss": 0.7302,
+      "step": 123
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.32350981307906573,
+      "learning_rate": 3.370346964876036e-08,
+      "loss": 0.7048,
+      "step": 124
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.41093063999025503,
+      "learning_rate": 0.0,
+      "loss": 0.8109,
+      "step": 125
+    },
+    {
+      "epoch": 1.0,
+      "step": 125,
+      "total_flos": 112252921184256.0,
+      "train_loss": 0.8373198323249816,
+      "train_runtime": 1963.4796,
+      "train_samples_per_second": 1.019,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 112252921184256.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6785ac69d5647fb96903c1ba7432ad49dcaa6fed
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f9ee08b523df0dbe39ad4124231fb5c7f8d786af
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:568dc13a0488d7d30be058a2c0dd7311dd566c7e687ce101b610ba27a191de8c
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..882444ad5675293ffaade1cce09d35c5f089fe4d
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:932a3bcb850e75ed7733b784c557c1300e684bc602034c283d60188fe074fd64
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a57f5ec125f99908cbae5f0de166dd02271e52d4
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.7614603133138718,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.2054,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 0.9127747104502665,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.4222,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.839285541892957,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4296,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 0.7672733099016094,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.2582,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.6387348905114876,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.1707,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7582928278378311,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3103,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.7192602920897577,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.2863,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.6167677211414274,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.1186,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.6996031262167461,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.0992,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.6042192060485368,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 0.9973,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.9060558121505383,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 1.2129,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8495686330600954,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 1.0312,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.7074201545607064,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 1.0136,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 1.1788839348627669,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 1.0729,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.6364794656570034,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.9574,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.5681338682121225,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 0.9741,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.6130526671292817,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 0.9423,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.5563517522802279,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.893,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.6013801587529852,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.9249,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.5865005378899556,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.924,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5174851167888802,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.8749,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.5461881416444898,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.9422,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.4747564234230771,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.8907,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.43288426326437696,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.837,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.4962669711011486,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 0.8915,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.5901579206727451,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 1.0372,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.46045329393216505,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8052,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.4567183595792635,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.822,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.5127915079330957,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.8611,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.45212541128910144,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.9031,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.4301450894070365,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.8078,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.5866260600585053,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.9865,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.4862368145091248,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8985,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.5316394614125118,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 0.7971,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.4837818428894648,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.8283,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.47387291685763033,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8943,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.527997166673022,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.906,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.48495165100397924,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8542,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.48161554925204597,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8485,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.48539144350667346,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.944,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.505239821205383,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.867,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.46348589602511586,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8555,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.4527428066313504,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.8377,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.4624001837932464,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.9011,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.49908799700671475,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8382,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.4355107141235247,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.7862,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.4386913869679193,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 0.8395,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.4541431792135339,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8412,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.46651404808286145,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.8349,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.5648871754628285,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.9897,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.4384363421914036,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8179,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.45413202194745417,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 0.7779,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.4857043747858801,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.8535,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.4719582644464707,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8999,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.42847542026076446,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.7584,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.5295530041959768,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.8573,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.49939356616520253,
+      "learning_rate": 0.0002,
+      "loss": 0.9823,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.4588501178491851,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.8502,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.44087109587454565,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.7886,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4622585621540621,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8248,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.4436021478832358,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.7526,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.7233960848577827,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.8258,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.3961220197860444,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.7528,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.4814306656556086,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.8285,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.47322986486209684,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.901,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.38116839724000634,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.7233,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.4660356984106766,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.8805,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.48008173674551324,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.8075,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.43163708053698957,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8487,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.46084182108467253,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.7633,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.4725108721087422,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.8055,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4397202059126982,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.7986,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.43448440899059,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.8379,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.48113391072739375,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.9333,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.43618973002024286,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8005,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.4871652179681046,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.8683,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.4446358368823649,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8502,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4248813501781938,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8766,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.46175056141708515,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.7788,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.4459040827699482,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.8855,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.43452898585972144,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.7885,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.42858107910034143,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.8269,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.43497214484723523,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 0.8486,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.424602618688084,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.7366,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.5237333285609881,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.9504,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.5282217133631587,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.8777,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.45125628762546255,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8052,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.4478849100987688,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.824,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.5348709935122913,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.8655,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.39791200867162885,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.7754,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.44945064830262604,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.784,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.48213644490370783,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.7894,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.45037742809641995,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8057,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.44421092222784353,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.813,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.4147330635700553,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.7554,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4361018690916765,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.8233,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.4548187706211488,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 0.8032,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.49615822470903287,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.8775,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.43517490620884064,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8126,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.4189323002296131,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.7906,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.5545137024772776,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.8504,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.49566000330799753,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8578,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.6582761803171807,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.8038,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.41933245789817597,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.8217,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.45876305310028254,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8069,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.4481832891439225,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.7822,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.4002701266887735,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.7476,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4251044019248547,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.7378,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.4482236948746077,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.7619,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.3942240120579654,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.7811,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.4285156667145782,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7162,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.46219900495041816,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.7972,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.43503169376098205,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 0.8306,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4532641384418678,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.858,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.5123681549994666,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.9013,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.4393644420012338,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.8141,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.38899172381386843,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.6798,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.44230739320439016,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.733,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.5149114930756535,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.8444,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4485062818049623,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7891,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.4636467000362323,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.8849,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.40070958153035385,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.7495,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.4268024210922533,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.7842,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.44017457248078234,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 0.7507,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.4902090345072743,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.8333,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4015997691801862,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.7465,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.3998053633531983,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.7843,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.48673822688688767,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.8761,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.40929288391082347,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.7823,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.4408492375250808,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.8763,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.3877134360544658,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.7505,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4271539457232771,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.7919,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.4794108836221678,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.8133,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.4969442947898306,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.7914,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.48217440979886395,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8682,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.41120601922700956,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.7721,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.46070134695490805,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.8376,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.44083539879169376,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.8188,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.4276897943709089,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.7807,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.44924721877515844,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.8032,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4221092408881946,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7598,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.41134688433967886,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.7283,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.41291215830679434,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.7853,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.40653646989488545,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7657,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.38656478464242355,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.7072,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.39581179446802794,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.7828,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4471986990104352,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.7872,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.4580108314655394,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.8182,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.46925373163903356,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.7852,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4265013835353865,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.7921,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.4005929185954514,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.786,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.45260132174087747,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.8836,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.37954072790512317,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7182,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.4522102056887562,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.8447,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.5369238359841023,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.8587,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.45061713714114243,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.8492,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.3965762218434296,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.7762,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.43393098138537767,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.8346,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.4222125134860982,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7854,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.4327902552736384,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.815,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.4395089818254555,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.8101,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4301132582767848,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7872,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.4941306433943535,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.861,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.5003393724639728,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.7904,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.4959199719206625,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.838,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.4606645442730648,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.8272,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.5755397880461268,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.9147,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.46518023513909984,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8505,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.41777269290883634,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 0.8053,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.4079364409236901,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.7325,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4635173524945271,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8913,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.43020478512779425,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.79,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.4922312608876827,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.844,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4134610957125838,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7729,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.5264752394888129,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.8994,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.501755351477428,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.8309,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4656418181692643,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8631,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.4500200556366832,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 0.8611,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.40695089580446,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.7457,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.43254424843847755,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8365,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.38460837023161293,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.7343,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.4331189167109147,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.7382,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.4504243679472576,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.818,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.4610839042953873,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.7838,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.45098999422050184,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.8295,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4405463454412497,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7709,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.42813823129016565,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.7631,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.5041895967649391,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.8779,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.4660587382794085,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7709,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.45041274116588015,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.8405,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.42737248919635235,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.7928,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4344179415196128,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7602,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.47364046417924766,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.8361,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.4672841088278783,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.8234,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4325747981059211,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.7891,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.48082654673640257,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.7887,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.42110747240245316,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.7707,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4522076625082216,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.7181,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.4596901115843113,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.8409,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.45036275626350614,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.8092,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.41744518461348945,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7511,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.4835766664050528,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.8132,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.46211166533568315,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.7628,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4546632477213524,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7828,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.456838499007109,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.789,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.48846667241796904,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.7639,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.4320391164167524,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8361,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.41527293292274986,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.7995,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.512836433667297,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.8323,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.4237823379933958,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7709,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.5352301258587794,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8263,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.4231418182692326,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.7834,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.45272515603161934,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7671,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.3920941873110034,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.7368,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.49631245516639083,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.8451,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.44609129602396586,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8309,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.4022816924207915,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.7527,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.478976632328823,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.7908,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.45135239518940673,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7858,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.4358801136556032,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.8212,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.39111685562574505,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.8156,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4797254303453361,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8835,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.37436051437624385,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.7374,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.42537579894871347,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.8072,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4347633584472628,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8094,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.42567851321973016,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.8078,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.41827443110157314,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.7221,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4259899522879917,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7287,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.4608962828880691,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.7996,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.4676139355530056,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.8287,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.4210394577520858,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.7907,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.46891173284904897,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.8668,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.3575156070876499,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.7177,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.3990387525951872,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7482,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.41120803401329314,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.8065,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.4477273088222328,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.8217,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.44031792113180557,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.791,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.438521440691371,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.8312,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.4647933969283093,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.7561,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.42542690295590896,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.7849,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.4283048163569719,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.7429,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.41729230349711055,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.7911,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.42409827036893505,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7667,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.45555074238723325,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.8111,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.4275715808233641,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.7544,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.43475231347946225,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7598,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.40237868403890853,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.755,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.4070690513157304,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.734,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.4524921752244452,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8266,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.43520211044043444,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.767,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.4503370166116177,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.7568,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4770182557409676,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8477,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.38594426530951254,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.683,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.4628521389132707,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.7948,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4483194544871081,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.7581,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.4274640982303173,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 0.7732,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.4578237135150185,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7698,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.40657945119684724,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7358,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.45123971773383875,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.8045,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.4450941319936875,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.759,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.45149620245292893,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.806,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.4467897175764718,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.7645,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.40514445423020395,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.751,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.48831563084136037,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8054,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.42281959883654185,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.7842,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.39126326593973365,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 0.7645,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.41985934472402275,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7772,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.48212383198795683,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.8167,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.41699215934629774,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.7826,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3980813154245854,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.718,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.40892013264312427,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 0.7783,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.40845481146044704,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.7519,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.41275194566666135,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.755,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.3756995702327026,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.7828,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.4069881524615693,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 0.8123,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.44556873876785796,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7432,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.4267954294684829,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.7555,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.41854023230084636,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.8007,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.4226803217688415,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7786,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.4029724657127977,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.754,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.49131066949159913,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.7747,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.47760159762825755,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.8425,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.4454821627434679,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.7882,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.3936910423411889,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.7664,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.6269939926797371,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8943,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.4841497625636035,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.8011,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.4415326311332174,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.765,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.454293611217275,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7912,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.46757920591817387,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.8217,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.46476638206133764,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.78,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.4151735777273285,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7033,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.4274306460164623,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.7739,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.455430053788848,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.8669,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.41159284980500294,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.778,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.41012325260301996,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.754,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.4612699513123622,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.7306,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.43272865871889055,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8341,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.41963200331568457,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.7798,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.3969513919598155,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.7619,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4297719531502467,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.7681,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.45766795714776487,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 0.8352,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.42487143616388595,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.7428,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.5211110951690558,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.916,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.3778763748723273,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.7194,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.4077185288189954,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.7293,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4208184764998574,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.7764,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.42027135789116177,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.788,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.40554900276058214,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.761,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.42742857255717165,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7439,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.4601886210902722,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.8274,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.478589187295555,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.778,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4633748369336836,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8225,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.4083129578873108,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.7405,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.620832278768044,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.7697,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4077726180167016,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7279,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.4834457372161605,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.8183,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.44163255685108305,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.7696,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4687268712645231,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7692,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.43966287562173245,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7565,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.46945289527620604,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.8155,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.40981539825547586,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7615,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.4528331049187482,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.7866,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.4279879948857359,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.806,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4567720668919121,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.8126,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.42559932918287957,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.751,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.4374833787533294,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.7758,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.400521048887267,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7182,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.4411305070030174,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.7422,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.3960947529502556,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.706,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3983703097172135,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7251,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.45846564057134404,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.7802,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.41159932485400996,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.7601,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.447424805825611,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.8029,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.40604851956494914,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.7416,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.45501976602393485,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.8366,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.42459722612502765,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.7481,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.4919631166230819,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.7855,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.43033665282321837,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.7469,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.3804600692068997,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.756,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.3941496431808659,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.6994,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.4135361886334295,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.787,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4036937660863021,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.6809,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.4144768391836234,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.7235,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.414924772270386,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.7745,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.5104440648202117,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.814,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.4510674205737547,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.8003,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.4615965741984511,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.7685,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3867991235423707,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7224,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.4064479524634665,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.7762,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.4437818670338459,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.7865,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.45360184922661184,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7674,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.4196062484442272,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.7544,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.4803602897556589,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.8425,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.49072751080915683,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.8314,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.47772940016280147,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.8408,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.42634504050193084,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.7796,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.34872832935612424,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.6529,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.4980111280217522,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.796,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.39347240314594795,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.7456,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4051626526402004,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7584,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.48572228625634567,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.747,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.41051675505313506,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.7866,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4152477619489462,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.7572,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.4682045694000191,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.7866,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.4064667329570052,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.7861,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4351942533622352,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.8325,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.432979227691755,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.7952,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.44332700796138425,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.7879,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.39922637527320176,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.6837,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.41532893705236523,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.7282,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.45028989276063536,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.7992,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4512458676474834,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.822,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.41483902235497905,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.7586,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.479064586610721,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.8433,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.42314123383647745,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.7837,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.3852527022550998,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.7022,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.34546969848499054,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.6765,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.39831988638266946,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7054,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.40016473875548725,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.7309,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.41862062761699875,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.7649,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.4216060227393941,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7933,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.38746765063921507,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.6973,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.42422814568525985,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.7335,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4175831076791634,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7776,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.4488967788947463,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.8528,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.3804940783199971,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.7103,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.39265951485970907,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7021,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.4170108586130046,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.7074,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.41255415960615377,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.7455,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.41316936831534123,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7786,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.4522356929167597,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.8267,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.4199394826524308,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.786,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.43124818409602006,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.697,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.45302212707459977,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.7375,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.38967910480164086,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.7245,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4413737693750598,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.783,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.4270130265037395,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.7503,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.3981751574168622,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.7069,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.4343883652859093,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.763,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.43844604515187074,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.792,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.4848368467011727,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.7928,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4670982578822998,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.8336,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.41721696610767667,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.7267,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.5100412521942448,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.8037,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.423327472770163,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7078,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.39903295305294834,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.78,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.4273087221317984,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.7656,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4701733296751263,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7457,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.4849339935513332,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.8422,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.4365292534868585,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.7459,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.45540180676703806,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7434,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.3821015207412149,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.7346,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.4237340453439172,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.7572,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4260188088738735,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7807,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.390168704071244,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.7416,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.3834271348861522,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.7651,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.42596724690668536,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.8005,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.41677988711146163,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.7555,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.38915845494043444,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.7186,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.40347775870221103,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.7605,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.4255599589983862,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.7362,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.4357997850956404,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.7813,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.41914054066713735,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.78,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.44432795823533866,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.7659,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.38912853603617054,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.7743,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.46612585654323574,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.8243,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.3984567394637064,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.7212,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.40799714213879,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.7352,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.43347839663468984,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.782,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.4253061709077604,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.7764,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.42087057607363554,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.7868,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.43254532172608784,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7589,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.3914305001715474,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.7172,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.4321606521827664,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.8013,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.38543498665079473,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7057,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.43362129868488725,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.8642,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.41177356200450155,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.7745,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4252255336463102,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7475,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.44562431381397916,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.8311,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.45539376204648174,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.7755,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.4034765961175896,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7289,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.4141539447877892,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7284,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.3813398646439378,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.6716,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4291614520514329,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7606,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.41686238869141945,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.7308,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.4201586702551558,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.7733,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4444739909491757,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7768,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.4435520129736999,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.8036,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.3649018873711,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.745,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3444176443967446,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.6535,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.4298665170208588,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.8094,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.40676712466276643,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.7194,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.43694320022290134,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7734,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.4159195167919046,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.8063,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.4483277895803319,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.7298,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4747362557925032,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7752,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.35323565552285224,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.6711,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.41893469671433203,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.7222,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.39907565151968166,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7211,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.35675320654518644,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.6646,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.4270554230200422,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.7759,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.40229978004171096,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.6955,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.40312632692715095,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.7573,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.4113460109150638,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.7848,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.3618275117102789,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.6895,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.37828688022097967,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.6574,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.36553798988091857,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.6943,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.42836526001614417,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8108,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.3877425115957267,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.7378,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.3977654548543868,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.7236,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.37716494049838983,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7261,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.45903834158387774,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.7993,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.4169406540559001,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.7397,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.45744325740569985,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.7573,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.4001821206604834,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.7025,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.39634111966326696,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.6665,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.5070556136631094,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.8214,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.38470652364492003,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.6866,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.40788529857724415,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7448,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4278925754220809,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7503,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.4428610484994966,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 0.7242,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.3869065267998443,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.7129,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4282684396189687,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7906,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.43378828051890717,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.7897,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.39590989306573193,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.7515,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4865555425675011,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7963,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.42908700805942523,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.8124,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.39729606754853064,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.7174,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.4422087780106003,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8041,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.4319822642481652,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.8377,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.39146511331112815,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.7496,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3872969620787598,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.742,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.3722417379050899,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.7036,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.427224706505018,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.7732,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3659168170781193,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.6904,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.38312089240575625,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.6751,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.47528377786799064,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.7232,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5043159420482285,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7561,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.4071912729726721,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.7082,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.3692359251504417,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.7264,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.43310810077684875,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7667,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.4303282860540202,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.752,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.4892142742692592,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.7797,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.47756245089452326,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.8877,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.44826648467260455,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.7485,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.3986453148625839,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.7154,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.46155684944406006,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7527,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.38390418611076577,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.7593,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.39270062767400077,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 0.7018,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4368403876738737,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7272,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.38999942509832336,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.6816,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.4413102369849412,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.7619,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4362991249943504,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7747,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.39460974516061925,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.7176,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.45970430321639394,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.7949,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4118570173992488,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7681,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.39082874113516197,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.7562,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.44533486466923117,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.7804,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4308733152481316,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7762,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.4078213675567917,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.8142,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.46235105153878314,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.7766,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.42235316836926556,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.7069,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.4307082196447437,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.729,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.40872749605019615,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.7246,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.4170055420382616,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7001,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.4946542189552747,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.7429,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.413904366744721,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.6733,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3846958968706317,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.6999,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.39876829263664965,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.7422,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.3984058117963898,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.7055,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.3856828329145514,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7004,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.43491636219821733,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.7563,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.4554947566059297,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.7972,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4000537333043658,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7201,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.4248067779621954,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.6962,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.42771621442800506,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.7559,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4100187500353154,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7197,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.4177323128557551,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.7485,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.46767615316900574,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.8013,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3972361306178011,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7479,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.44556085338785484,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.7711,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.3905199839578012,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.7301,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.41990641788526667,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.747,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.39307884536736915,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.7578,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.4117627250864761,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.783,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4932581378692019,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.813,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.4586273614922315,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.8288,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.4019580126071487,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.7366,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.46632431465167107,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7385,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.3905508336380154,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.7321,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.4315518883593066,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.746,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.5496492145646412,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.8785,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.3935500643047362,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.7155,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.3738068156508414,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.7019,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4522757933248983,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.7664,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.38570043302049833,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.7314,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.39520128159650453,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.7076,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.34875746220500975,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.6401,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.3850207062278375,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.7624,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.39356618449823616,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.7934,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.41023404326225077,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.7533,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.38185451478416427,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.6844,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.43521746572164594,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.7764,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.37052202040599447,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7119,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.40201740247949785,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.7365,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.4173521352111898,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.7902,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.41543959694153865,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.7467,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.46137903716965895,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.7581,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.4078436798194787,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.7116,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4415643373573497,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.7527,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.45056751725975336,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.7513,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.38398698181612956,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.7066,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.47607256049205215,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.8087,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.46320237969409656,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.7637,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.3900926639602743,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.759,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3945527842305872,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.6851,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.38971042494020103,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.7286,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.4045725727158994,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.7351,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.44749498801010373,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.8119,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.42103833073666364,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.7662,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.3775041905408943,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.6972,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.43153381420568787,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7316,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.39947290789932155,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.7309,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.45729284062204945,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.6902,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.48660864694329276,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.8257,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.4825049084275357,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.8216,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.3935027451385052,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.6903,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.42125994179094617,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.724,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.44333295681260426,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.6565,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.4313425441161452,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.7675,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.3772164435988223,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.6755,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.42915178129946013,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.7515,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.4184007203040218,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.7358,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.40765011752081065,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7265,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.3951190521913299,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.6542,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.3855910412512755,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.7006,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.42424025815136057,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7634,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.3877096634029584,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.7645,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.4755030554081805,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.7008,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3945273065026347,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7261,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.4322918940947344,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.7855,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.4008059808950541,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.7038,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.42735681713122126,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7303,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.44799972860329035,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7592,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.429338520686409,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.7074,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4465954476240394,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7319,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.4900134635713606,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.8457,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.4207105842197505,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.7544,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.42451770322137466,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7044,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.3589686471116743,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.6743,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.4450708223693317,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.71,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.40686694321392736,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.688,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.39753103284223834,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.7351,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.409828206041139,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.745,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.3667751581575778,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.6701,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.4484573939780761,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.7822,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.4242610062223099,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.7993,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4532416341596631,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.8287,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.4887705723429555,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.792,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.48812937766764497,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.7167,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.40975242626263575,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.6962,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.3697513759304386,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.7037,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.4373933446685291,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.7209,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3852841466947889,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.7435,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.44567788187910024,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.7432,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.4685861065616478,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.8281,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.5071683487190545,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.8902,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.4164121017068378,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.7467,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.4111203128036995,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.7431,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4100653264715916,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7005,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.452828147400987,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.7994,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.40800931677571645,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.7733,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.44311289069816373,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7218,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.4271579407616302,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.8083,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.5112305632950536,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.8084,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4006633008920473,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7126,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.40727304965830635,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.7368,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.477949911384884,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.8195,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.4526096288861997,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.8008,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.4512229875267074,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.7436,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.43480158539272745,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.7677,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.39829699026731086,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7303,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.43235734102891127,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.7614,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.40414847698605944,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.7194,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.42756992399038113,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7621,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.40257715695337964,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.7167,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.43021398031232083,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.7408,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3759716060216146,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.6591,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.4931180390492152,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.7957,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.4355637206307468,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.7681,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.37464229247023656,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.6569,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.4020759123911303,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.7167,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.4221289896570997,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.7557,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3939997279620741,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.6649,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.3917110467034314,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.7077,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.43924193247345256,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.7942,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.3969193877305016,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.6673,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.45038221706615134,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.7246,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.42164306602391805,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.7425,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.40140230723508524,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.6991,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.4750484854460713,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.7824,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.4455116350805326,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.8367,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.44618110095389835,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.7152,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.4475622955097561,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.761,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.4359749671738415,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.7621,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.38747409675067007,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.7391,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.4164598379253978,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.7612,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.4229031544604517,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.7445,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.3673383598153167,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7138,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.39070215256485175,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.7118,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.42452823633461184,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.7388,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4263707701419675,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7526,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.3689626812113726,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.7108,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.41376531374820275,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.7284,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.3923399191140614,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7255,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.4748090626326111,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.7893,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.4158425725741022,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.7224,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4459627005422889,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7157,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.4273511318023322,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.7572,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.37482426750995995,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.6706,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4006428451318836,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7872,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.4083100773492591,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.7192,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.3265496595368322,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.6042,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.5120580846177375,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.8355,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.39935262745294176,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.6913,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.38092835117157264,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.7005,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.386526250523854,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.6555,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.4667619972650561,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.8299,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.3839903028142022,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.6692,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.48381307084576924,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7754,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.45713833522429165,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.7679,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.4131366758433008,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.679,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.38089852736252117,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7627,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.3699847132301396,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.645,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.3696014091782039,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.7249,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5056206052640094,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.8714,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.539932145998434,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.7293,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.3700863336476834,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.6846,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.42855540848732004,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.6818,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.4371717624109415,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.7209,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.4705215508951361,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.7311,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4458159431732122,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7703,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.3917851894183172,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.6679,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.3580350158463694,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.6806,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5055093024285995,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.8129,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.3572657850969502,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.646,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.35043563922430143,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.6541,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.46945371784425727,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.8082,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.3687691495275247,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.6711,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.40667822947642723,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.7518,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.43797146761055344,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.7835,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.40210353606465987,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.6762,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.4270392278398279,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.7539,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.392145917174963,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7186,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.38864957905158737,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.7011,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.39289194672110705,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.7745,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.3708892524080064,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6661,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.3585159080195186,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.6753,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.40272715048225605,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.7383,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.41579811759066937,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7354,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.32887259036258953,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.5956,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.3603762966046134,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.6439,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.38779763759157626,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.6249,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.4327625662612419,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.7579,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.44107105407560276,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.7354,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.48944825347676674,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7462,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.47187198334662633,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.7667,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.3811573487925462,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.729,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.3498926333357584,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.6226,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.3645109010829484,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.6822,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.45291067357258424,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.7952,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.39190670348598583,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.703,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.38835265989584566,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.6677,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.40703525351336206,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.805,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.4110630421712686,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7354,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.3840940545901596,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.7197,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.4467381460243381,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.7054,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.46166401359335174,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7335,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.36351310619125765,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.667,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.38036769676459325,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.7207,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.41443306808069247,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7301,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.3976722254079516,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.7417,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.42931174429527336,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7351,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4231884462170058,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7328,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.37324639296983,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.659,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.3905727457838949,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.6958,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.40036871893220416,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7091,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.38528150187429655,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.6755,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.3797882884409939,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.6851,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.44454612812770267,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7884,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.4253589924655092,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.6989,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.47466516275733234,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.6635,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.4277493972756891,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7397,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.4150799867682261,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.7359,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.4079182840730704,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.7243,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3961983582766719,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7152,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.37186248794663584,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.6974,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.43562388220908504,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.721,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4005835205040354,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.7137,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.3687616415807105,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.6721,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.4346945852782822,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.7192,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.40119708767717255,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.7319,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.40914752470278004,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.725,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.4515602601088684,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.7346,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.41088961193235307,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7259,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.41329628712878974,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.6726,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.37944231545414464,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.6497,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.39338187736531505,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.6974,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.3584335156406577,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.6539,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.3996459812552077,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.7899,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.4186247409116387,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7389,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.45203679024896604,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.7771,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.4127072642765617,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.7293,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.40356005146175844,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7156,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.39594502540516324,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.694,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.40434994474435,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.6816,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.4323229255072922,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7809,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.3869847380684972,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.6869,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.4210267062837234,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.7502,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.4063539287165066,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.723,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.3740075391352541,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.6909,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.41440904326048766,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.6842,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.38046217217993816,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.6394,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.4247582721688887,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.683,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.4434915669750256,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.7719,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4982210512879041,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.8067,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.371623762692025,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.658,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.48032846665453766,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.7838,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.46365591562736086,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7915,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.46413594665498614,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.7562,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.42736336506004996,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.7377,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3786148512927807,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7068,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.4600288748192486,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.8246,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.4337822021767261,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.7531,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3747108498459979,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.6639,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.3738286018775399,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.7045,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.3924295584496147,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.7193,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4617688049197154,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7818,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.37537774354945436,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.6879,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.4054147302983524,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.6908,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.41882507548900794,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.717,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.432655112010318,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.6868,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.389907435925452,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.7122,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3816531957696866,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6609,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.3702862962953432,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.6818,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.40667358788801045,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.6645,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.3784239646653406,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6443,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.49086708606364227,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.7911,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.3961628703192048,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.6759,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4463250420691135,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7735,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.4686810913140729,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.7573,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.40909977652780205,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.7469,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.3727309167141433,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6745,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.38008510283465036,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.658,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.4033939911249178,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.6661,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3718141592194589,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6621,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.41062395716625416,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.694,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.3862113919135583,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.7508,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.4045790094090187,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7028,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.38043534595271034,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.6517,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.42898380122884394,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.765,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.4195425785194382,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7503,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.3797934256521421,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.7062,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.4054779301842965,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.7178,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.386901638774086,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6683,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.36400702988521083,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.6829,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.40184726566934065,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.726,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.39365513267463004,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.676,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.39919769902042423,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.7201,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.3665210804076896,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.6659,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.44196580313984124,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7519,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.4614342280148402,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.7913,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.3685609530040209,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.702,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.5376246603998738,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7592,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.43015997174916043,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.7116,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.45254388676147145,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.7703,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4368478152575418,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.712,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.40621015655682485,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.7853,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.45605323048807755,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.7578,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4152530436713154,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.7036,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.4014292071555056,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.7025,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.36497959792830553,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.6592,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.3880987257987707,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7066,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.4352812133149904,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.6807,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.39263478153811443,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.6943,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.367181541698975,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7061,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.441297066790881,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.7277,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.464626225780676,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7754,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3887689717509452,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.6797,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.38715143855728307,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.7329,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.37379709303278474,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.6773,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4160027753940018,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7129,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.3758872943401131,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.6992,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.4019839763291867,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.6855,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.5039043350205292,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.659,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.42677069004798573,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.7523,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.35839391585223196,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.6313,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.45502376077565315,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.6875,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.4431890444060972,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.7041,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.37562137625609004,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.6908,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.3817588509796904,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6974,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.45904913572049466,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.8008,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.41227840756214557,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.7239,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4055746575625792,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7047,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.4056965741181116,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.6877,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.4386878658210948,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.7164,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.5597734463574393,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.8012,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.41627029095711515,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.7141,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.394981353695425,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.7664,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4845057923121737,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7589,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.42184021473800426,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.6862,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.3685228332680087,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.6971,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.48125970533061785,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7705,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.38603224204001146,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.6801,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.40031986426280525,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.6784,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.382980299985652,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6637,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.37255439825452225,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.6463,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.38972893339230685,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.6787,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.3879058499146623,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.6504,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.36745829887480014,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.648,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.40196670311819926,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.6694,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.43880697355150333,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6748,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.367688992998293,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.6344,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.4286921521160196,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.7725,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4276181045914976,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7045,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.4602914646784364,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.7106,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.43716536637456466,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.7746,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3626625983037319,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.6579,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.46860471157021877,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.764,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.41045660210116464,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.731,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3752662286080647,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.6549,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.4224597961900508,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.775,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.4024833252997957,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.6596,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.37603725859632336,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6945,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.3687870047601706,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.6411,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.3932754469285646,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.6972,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.330723604928933,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6111,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.3881987511650391,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.6641,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.37490505625431775,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.6733,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3797740850467834,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.684,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.37245630143840797,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.6783,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.4368289721232358,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.8057,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.37089523762776133,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6965,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.3837018215606554,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.6918,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.4107940671971852,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.7647,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.38062943148129663,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6153,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.347440286463666,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.6966,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.4138494786190947,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.6617,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.435120628912503,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.6947,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.5156696038671611,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.8521,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.40002449111770144,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.7086,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4584453650081408,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7203,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.42971470634919057,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.6927,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.46160825121281207,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.753,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.4484481414251196,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7451,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.353331022440045,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.613,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.39814982162243234,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.6719,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3670872104876671,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6711,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.44024385623542284,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.7356,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.41161265951685283,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.7264,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.3984442796149138,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6644,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.4294764379918491,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.7437,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.44704886237002023,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.7411,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.41290555750302516,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7867,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.3613358302667413,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.654,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.361773515899362,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.7043,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.4082345023649185,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7041,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.4135580682589969,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.7068,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.4394394752224285,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.6995,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4332952825333924,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.7289,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.36654579801605636,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.662,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.40193997576120905,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.7351,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.37513486505181953,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.7507,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.3961975057784801,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.6518,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.4097332705661171,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.7519,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.39234400713069595,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6576,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.43052107521153943,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.7585,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.3936263192518126,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.6704,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.4657507551578899,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6868,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.5021464715895197,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.6723,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.43506222988133986,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.704,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4156106779609884,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.7259,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.3957962157850304,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.666,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.42205766234785436,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.7411,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.3900808669526344,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6336,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.4076171734832918,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.7344,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.3847244306006347,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.6885,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.40880811223071983,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.6969,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.42801078617179544,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.7446,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.48044160927138174,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.6577,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.4257807920668605,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7137,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.40497149124415505,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.6872,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.4335781038181838,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.6696,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3985487513226291,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6448,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.4269506200122535,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.754,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.4301253765909779,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.708,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3930380897648193,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6582,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.41425780092834197,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.7319,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.4496872681858521,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.7825,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.429109992059253,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7563,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.3533786886319857,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.6602,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.36720268355982055,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.6362,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.37055946076736773,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7085,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.38241199146178695,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.6914,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.38049684847414167,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.7164,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.36780014919222614,
+      "learning_rate": 0.0001,
+      "loss": 0.6807,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.4176219481881425,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.6884,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.43880841323776304,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.7612,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.4083242613195229,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6761,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.4035241252326839,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.6413,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.4202181352789713,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.7091,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.410067103211802,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6901,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.4316666564424048,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.7605,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.5172372427864208,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.7423,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.3822087471114855,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.6793,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.4211041483634554,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.699,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.34983710521200895,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.6348,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3910577540068633,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.7243,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.3960657703459683,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.7049,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.40337191060802974,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.6345,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.3703239762196236,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6558,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.3634029551035167,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.6327,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.48215601756608845,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.745,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3638409558633442,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.6507,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.41204323574299007,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.7287,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.3852887012348035,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.7447,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.3841897472928867,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.684,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.40780006190278706,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.7232,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.39603259505315236,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.6245,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.45406690853388815,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.78,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.38904989956359265,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.7168,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.4325600684124193,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.6561,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.3830034699641413,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7008,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.37968275486463826,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.6589,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.4138078396366489,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.7354,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3678922571932153,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.7075,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.3394208426931613,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.6508,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.4532313710316155,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.6466,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.3531050111256949,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6072,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.4473101196705385,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.7153,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.4056044330445051,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.6826,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.4204381386331875,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6903,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.390453552431553,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.6792,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.3792222651183991,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.6492,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.415249245822204,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7074,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.37759188533924803,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.605,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.42878386883350156,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.6881,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4605947532728995,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7806,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.4243274933456629,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.7261,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.4216238590088995,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.6769,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.3792720960569066,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6353,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.41176723371050467,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.7357,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.3627687163634997,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.6817,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.47965484245707873,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7677,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.39258008589673843,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.7012,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.35999804522968204,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.7255,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.3982795020365924,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6431,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.40106859970171016,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.7,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.42965692134016903,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.7035,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.43705161139558246,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7427,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.3918766943083777,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.6308,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.3985622780590428,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.6845,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.42672293533483363,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.6507,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.4143702320792897,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.6889,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.471380879969022,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.7823,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.46480075498108553,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7272,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.36976265158301574,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.7031,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.42436037298433815,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.7187,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.41953791874187923,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.741,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.41981610603411884,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.7184,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.4190252188169704,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.7345,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4184101905417997,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.7642,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.3879175502059154,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.7347,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.32715542633091615,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.6334,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.37958261854030473,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6843,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.38701966625163015,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.7068,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.45127483979897365,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.7066,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.4101133580417039,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6577,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.3870440706258641,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.6597,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.4096386184294334,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.7066,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.4397059282577514,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.7115,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.643330271525409,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.6834,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.4134643699688854,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.7021,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4091055590321153,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7012,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.3450013466049551,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.6151,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.47085742331395813,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.7699,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.39515177914325017,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6866,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.43479194884347794,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.7113,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.43860089142479186,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.6748,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.327668756532882,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6482,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.3769926364366294,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.6948,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.4222610092944222,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.7279,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.5048788519883153,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7563,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.43196567813218695,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.6819,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.4293025052867687,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.712,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.42894131592218576,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6954,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.5022696074419618,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.681,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.33992536862717254,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.6073,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.3643096903622327,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6004,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.4516354617724274,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.7386,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.4416293943588361,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.7443,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.41531808973420403,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7243,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.3935473285416105,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.6611,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.3474446139720467,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.6343,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.430512345727379,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6738,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.3641433058460538,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.6634,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.4136357697412744,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.6672,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3808396291501075,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6345,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.3999041628392743,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.6915,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.39809173016582916,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.6691,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.46464138468826066,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.7021,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.4222476443961406,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.722,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.4231280382809865,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.7378,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4691286662465898,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.71,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.41505030023745515,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.695,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.38762458314861936,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.6926,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.5153169192478391,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.6735,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.41114335332061647,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.6218,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.38947490596295675,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.6606,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4379699189436385,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7664,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.4361131199878347,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.7081,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.41313146945672086,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6189,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.5022765303498697,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.7532,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.4227556056013937,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.687,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.4223792352159664,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.7141,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.44012250934714675,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.6946,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.426144893073359,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.6756,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.38795705651306933,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.6604,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.36687962675189395,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6262,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.4227039580651776,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.6616,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.39994912985744235,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.6565,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.36900012142350147,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6177,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.3751018480152569,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.622,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.3883694896917854,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.6898,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3619978615033815,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6017,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.4168897067771285,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.6596,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.38026149451722435,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.6571,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.42324690565350187,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.741,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.38849404207526655,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.6486,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.44791957872685967,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.7033,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.474403779731019,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.769,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.38980569878904037,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.6718,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.39144939025603787,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.6537,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4139639592906771,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.7053,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.3813478824967945,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.6919,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.34470477488748674,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.6022,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.349710254993917,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6017,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.526494869740576,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.8012,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.354436642808636,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.6422,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.40255733702132107,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.6722,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.3862620472965459,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.6753,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.5682438764980138,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.8275,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.38896197148966755,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6761,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.36915780047158037,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.647,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.4247439920018155,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.7008,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3757141023503735,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.705,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.380872384717392,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.6479,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.3710352051491737,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.6877,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.36921679104831656,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.7111,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.40081663228508874,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.7137,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.43909280291070063,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.7445,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.40435688760211064,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7437,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.37014311897723245,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.6338,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.3771698597332727,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.6736,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.33845853276475274,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.5986,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.4146528022896513,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.6821,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.4878004732486644,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.7403,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.45801342010665835,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7183,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.502859319845073,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.7439,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.392322837743993,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.6615,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.3751114926081077,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6512,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.3901322696216327,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.6758,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.35435486379734865,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.5914,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3652790684399189,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.5878,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.4087224220507586,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.6718,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.3792542895196207,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.6574,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.39634558554168403,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.6474,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.38772305877199503,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.6769,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.41830416685735256,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.6715,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4545498947925662,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6623,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.4042365730812182,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.6494,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.35185184266505637,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.5844,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.43979189245947675,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6531,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.4105425607854786,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.7003,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.44073988871122777,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.6725,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.40715286216987917,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6498,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.4168147465359035,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7153,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.34734906442239943,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.5973,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.4141761269341916,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6778,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.46154360872865435,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.6951,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.39235942831177384,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.6164,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4086673976195869,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6314,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.38699047434733536,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.637,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.3864206136275288,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.6444,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.6443641593948699,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6711,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.4639491609604134,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.7731,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.4338398092876548,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.6755,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.4216504465825253,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7517,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.39691900859881774,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.6456,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.39944058550910694,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.7185,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.37929747161823074,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6129,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.4121689131249372,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.6424,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.3811750190715987,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.6755,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3789942012694531,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6399,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.43042891265566546,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.6859,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.3753988772879043,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.6517,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.4132369716226117,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6835,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.39115922751091403,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.6629,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.4039177207102229,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.6752,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.39008089390378314,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7208,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.3608937832830116,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.624,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.43383834658583326,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.6717,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.4405346402821102,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.742,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.41521861877120464,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.7284,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.37123179121419514,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.6345,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.38631151315292184,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6942,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.38387266191895614,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.6166,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.3582324416184515,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.6622,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.39699462890408693,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6902,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.38693347836303943,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.6592,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.4052606688475977,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.6575,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.5729492026539168,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7168,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.45176947535627326,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.6704,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.3954642199930997,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.669,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.3705010476821673,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6375,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.4110192203534155,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.7027,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.49477491154053943,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.6808,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.42176395673777783,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7151,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.41520653526248,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.6729,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.3758987690503946,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.6589,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3961954582396681,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.671,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.39901339556539506,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.6496,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.4580813497560127,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.79,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.46003217150633746,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7883,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.3964260553568906,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.6181,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.3807548731983716,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.6998,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.4360297205208044,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6262,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.3865799540892062,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6751,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.3996530539284033,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.6568,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3727877088707968,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6643,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.3506964916728646,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.633,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.4065845009917423,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.6625,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.40512894680210737,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6539,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.41785748473259554,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.6112,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.37645224956069007,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.6089,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.38218319419326885,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6297,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.45907322660182703,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.7344,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.36262594700865114,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.5988,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.39423976779658926,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6889,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.37613984271554846,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.6263,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.4122433587935938,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.7386,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.43667685771125925,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6977,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.3711145671652538,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.6362,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.37451275746745327,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.6478,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.3979133670270158,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.6893,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.3484883827048639,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.6092,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.34144110736344746,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.5815,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4301704864516479,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6662,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.4348656862576235,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.7228,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.35639601367192286,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.6469,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.40792386695573035,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7141,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.40742722168314527,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.6018,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.38447771453604046,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.6091,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.38990919056436035,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6791,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.4344561767186191,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.6499,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.3548734099373113,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.5503,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.38248543772581156,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.657,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.338733827740271,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.6066,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.41067698761926447,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.6894,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4348889769321385,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6455,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.39813712105076304,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.6901,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.4265205105745672,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.5998,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4330741219625014,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.7161,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.3657696700746177,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.6783,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.36644757405630646,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.6734,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.45947345643473103,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7396,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.3489668548183046,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.6197,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.3690643783308543,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.6344,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.456107720397437,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7215,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.40203323005580555,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.7176,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.38494287502414254,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.6035,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.3734995791052284,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6354,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.3936094263134665,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.6424,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.44562805874341715,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.7226,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.3839119816128122,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6603,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.4483123370581339,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.7056,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.3468698326603632,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.6133,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4149625705227579,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6367,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.46041886071476645,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.7283,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.35143148139417685,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.6678,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3556389058456103,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.587,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.4266997961251182,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.6836,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.3936042753495862,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6242,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.4231806637101834,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7319,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.41705570276158715,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.7147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.43294963616605125,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.6583,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4156204083871984,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6546,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.35747998071692966,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.6083,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.3688837870547564,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.649,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.40179747105550656,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.6527,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.37358230952691235,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.6378,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.35890694501042475,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.5566,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.49052617470132837,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.7281,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.46210192447504694,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.689,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.37130683789508834,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.6301,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4074895758430878,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6395,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.37935208826673683,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.6514,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.34797199775753,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.6002,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.3522962014137995,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6155,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.42253423847456223,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.7097,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.3778680512518862,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.6362,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4296025660232695,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.7065,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.4062404775585464,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.6829,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.4858595019574037,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.7522,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.377448766992822,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6163,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.43079110371344853,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.6869,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.39744249709827023,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.6974,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.382931377970497,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6295,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.5014922132487974,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.8143,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.3613806912915405,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.6521,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.46920678429921814,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6624,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.42141869157299844,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.6487,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.3974521208398004,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.6627,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.38704435042495616,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.722,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.3713696259428589,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.6285,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.371014800468022,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.6435,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.4024601896130607,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6752,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.34903249981417445,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.606,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.40509439359192134,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.6945,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3694649482760305,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6442,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.39873395808385487,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.6284,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.3407261775615519,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.5643,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.41501238449358774,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6191,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.4104495898247997,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.6784,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.3807323293291751,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.635,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4595544098150067,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.6875,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.39532369727474764,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.6348,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.40109321782211227,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.6506,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.4037411071068034,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6508,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.44266104893603353,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.6878,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.4078006050037638,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.6487,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.41100819450479764,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6743,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.3538887019823367,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.6215,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.4446714112666036,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.7301,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.41170644722765004,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.7103,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.4605427485758196,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.7189,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.4061454260270999,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.6497,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3603132790154237,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.61,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.4302046479056169,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.6986,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.3576314792300624,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.7011,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.43070992131470415,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6702,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.3970467815078321,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.684,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.4261021129159375,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.7218,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.4204848942396004,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6163,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.41785258652619117,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.6481,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.40126603920945875,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.6839,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.39448733805358843,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6515,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.36827460951186997,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.6259,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.42407810403447627,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.6963,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4594114457629619,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.6613,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.3810204216023159,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.5945,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.43608545709597685,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.6711,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.3879651248398124,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6486,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.40586294188569094,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.713,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.40727511212981865,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.6809,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.4776969230797926,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.7639,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.3578661709379814,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.6136,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.39325973556204963,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.6406,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.4229079575333293,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6801,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.3923372999138519,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.638,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.4059336587960118,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.664,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3956049748736085,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7063,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.37588095935875665,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.6065,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.3913508936345303,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.6192,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.4283208983155436,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6759,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.43955154453963924,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.6939,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.42282096656173385,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.7184,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.395847147798968,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6204,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.3622357159520991,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.6509,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.4081229826979606,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.6628,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.4219479283383955,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6971,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.39957701543592067,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.6504,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.43649174920643524,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.6645,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.43141958081496057,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6191,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.6286525667864393,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.7388,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.38817772595331723,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.6446,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.412114221055016,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6787,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.4102293474560031,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.6781,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.3884376821622926,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.6717,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.38493039643014365,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6733,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.43585606362873025,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.6728,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.4280181141336135,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.7325,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.38025146419690364,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6435,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.5592834753588812,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.6667,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.4520181137303948,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.6851,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.39559532065065467,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6669,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.35950931405771885,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.6008,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.43270266536877905,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.6514,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.4344914160567954,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6384,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.4212639702085023,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.6417,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.3616803952409905,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.6676,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.4411224765959474,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6665,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.38343553384175344,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.6612,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.32958688063092734,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.6135,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.4026249083615412,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.6757,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.4048994078001853,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.6909,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.41934958724390453,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.598,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.42175418099779427,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.6259,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.3610517497042293,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.5823,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.4244103644608558,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.6816,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.48352806735759074,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7407,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.4817810985247976,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.6564,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.4697099875994108,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.7001,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4572944881204827,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6832,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.4525154876758304,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.714,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.4471117031442454,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6757,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.44471878985597685,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6996,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.36843211872059467,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.6277,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.4441944802513513,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.6573,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.39347093014029866,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6253,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.3952747929672544,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.6505,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.3618825221767066,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.6592,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.42388720908295424,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6772,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.3689456209657369,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.5967,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.424668041789921,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.662,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.3754656171823722,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6066,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.38912284150881926,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.6442,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.3548864565059086,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.6006,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3919077760510013,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6277,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.39380410841258484,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.6184,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.45027174851268076,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.676,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.40225833642099534,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6047,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.4376514664438265,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.7156,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.41199669510760717,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.6382,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.39735576466240907,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6172,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.3713753826248314,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.6646,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.44678575189964775,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.7011,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4602294032193928,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.753,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.46650053271103575,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.6214,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.40317733613726847,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.5471,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.4230092995633492,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6656,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.40055090409544536,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.7061,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.4482564049775638,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.6839,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.43519562712294285,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6938,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.39940095757081484,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.6529,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.4244148756048259,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.691,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.44833681532028524,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6188,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.4470535185148768,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.6594,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.3796134492665104,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.6161,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4191151668570498,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6627,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.3427507534658426,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.5615,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.43880115078621856,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6187,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.4215481081683841,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6477,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.4355019017233244,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.7073,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.4370484357019805,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.6765,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.43665035912216993,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6749,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.3958647254636817,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.6386,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.41915594147590046,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.614,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.42218803321434667,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6571,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.42335751599875865,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.6804,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.4153881746916546,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.6362,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.38307782632840287,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6581,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.5095928880331001,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.6828,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.3875864408922726,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.6478,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.44345363702168855,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6437,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.34961625306269245,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.5871,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.42627691075353674,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.7225,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.4429482812974094,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6531,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.3674344162611938,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.6432,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.4037733321667493,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.617,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3982375894729625,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6214,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.41177143165150104,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.6611,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.4277176910110703,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.6856,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.6910272239324808,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.701,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.4562352924729361,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.6448,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.39479950172562867,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.6421,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.37918581631614484,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.5989,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.4670561638981511,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.64,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.4052900218399078,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.6371,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3911527017578945,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6805,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.4202251589495763,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.6421,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.3981793324863265,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.6573,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.412000437682294,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6247,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.42914580210807063,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.6525,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.43285055947384987,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.7626,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.4413730077936835,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.7046,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.43148966043553777,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.6982,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.4401444936026171,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.6598,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.3874622319447385,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6194,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.41266089083924673,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6142,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.3625437521697885,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.5642,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4604695011057813,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6657,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.40972133019861046,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.6823,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.4084510280537911,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.6042,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.44398935321392907,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7326,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.4456134856463968,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.6628,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.43285530385725185,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.6711,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.39350930186551425,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6128,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.40527366985463825,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.6704,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.35349897082768456,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.6075,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.4106361016149464,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6751,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.3891782128191905,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.6061,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.3850547132027167,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.6236,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.44083105540317863,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6371,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.38691339636046546,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.6539,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.4420340889763627,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.6601,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.3886789068327339,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6234,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.38766142759886435,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.6049,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.3506735481023233,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.6461,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4709768903977795,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.7488,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.46595859291323644,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.6949,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.37820360769280126,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.6135,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.4254358917377633,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6419,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.4207943846089345,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6858,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.35872268946323377,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.5893,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.3913744045783546,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6081,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.40900627500047226,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.6166,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.3684146909254557,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.5835,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.37875513247523523,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6543,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.3912267090240937,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.6753,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.35805656895046295,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.6059,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.39932907661604244,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6439,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.4218337932594207,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6988,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.40449171604535267,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.6424,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.44683141125326076,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.6404,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.3546349444314611,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.56,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.41878688706271866,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.6986,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.41241198092673154,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6535,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.4356041886485949,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.6944,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.4851579940295009,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.6417,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.4628739332323059,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.7199,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.388260301176067,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.6158,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.3809160374646632,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.6552,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.38767748502294097,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6526,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.3828024778783532,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.6361,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.39807422851630386,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.6369,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.4059228476243633,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6717,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.38967800340072495,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.6315,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.35357562122949704,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.5684,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.38258943119478694,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.6516,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.4419137450965979,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.6848,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.43770767176075226,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.6657,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4488926601508519,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.75,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.3944234714230744,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.6434,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.37644829072906166,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.6453,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4044516702226617,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6403,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.3980673569707095,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.6476,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.5475183862366019,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.707,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.3843448951572416,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.637,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.40636938543825235,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.6367,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.39095803152116365,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.6309,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.36194859493401127,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.5899,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.48238799708196023,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.697,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.42831654623021886,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.6778,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3290612016576017,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.5184,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.43306775384722507,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.6517,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.37939300241931767,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.6448,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4174859091757688,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.7248,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.42897070111080293,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.6792,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.44468491812494343,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.7148,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.5270997886680009,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.7384,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.4003108742784879,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.6593,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.39089226418237993,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.6374,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.35194465381329293,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.5823,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.34472080784247255,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.5776,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.338792768683901,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.5921,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.376611047079288,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.5993,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.37387556233721764,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.6193,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.4021204250187721,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.6376,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4083914805445237,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6396,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.3962488022093343,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.6472,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.4124834308999355,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.6465,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.35479589603691897,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6233,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.4080382065749448,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6461,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.4493104764695561,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.6507,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.39692865212200285,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6339,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.4061488300472044,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.682,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.39685722765982656,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.621,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4150782486054149,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6831,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.3752175206795003,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.5972,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.42912100246172646,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.6303,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.411690212726029,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6656,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.4642829889474915,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.6909,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.4709376800450816,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.6764,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.38226574602583446,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6403,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.44060871721643696,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.718,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.37380761264975537,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.6535,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3917189288001096,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.634,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.43458844590345663,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.6715,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.37911668947364086,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.6382,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.3793806945062769,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6328,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.43584474096730086,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.6113,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.4109297686822174,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.6478,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4063417624152271,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7134,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.3709854276766787,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.6751,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.3999953238741327,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.6367,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.34047078430130645,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.593,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.37818008434984063,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.6541,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.3883142550297361,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.5832,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3607911014474159,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.6276,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.3862760347704396,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.6119,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.36933347067867883,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.5914,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.36033685441672925,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.6303,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.43306524792713263,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.6972,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.3996021976393818,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.629,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3963386564272573,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6189,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.36919022205639285,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.6503,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.34735951895870093,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.5938,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4077481894373184,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6124,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.36925086630683907,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.6419,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.41118593099557654,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.6309,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3822208135731575,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6795,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.4215040066887796,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.6591,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.38502469192610717,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.6466,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3805668773253161,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6322,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.43267277488087297,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.6539,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.3841521427387812,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.6485,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.38780750587337015,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6028,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.48541801549351904,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.6351,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.42295674367595737,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.6728,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.3900700328891032,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.651,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.326400083531713,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.5693,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.38358760875407694,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.6231,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3826742526760289,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6546,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.4025249079908315,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.6202,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.3541133912664888,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.6388,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.3815883631911988,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6464,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.4534226299418841,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.7443,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.4119821518537961,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.6913,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3924157628562887,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6496,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.36369766672205334,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.6482,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.3787176470885485,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.6009,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.4435549594830049,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6869,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.4346278720040936,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.6602,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.4482220380875183,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.6344,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.48419033078342644,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7642,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.36179924130776503,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.5854,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.39538725881852815,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.6469,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.4208253894034653,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6222,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.3659376261736916,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.5881,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.418286974140492,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.6523,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.4016537477945244,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.5902,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.34525462758911396,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.5599,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.42721498656573004,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.666,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.3802210622399738,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6078,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.3598703676282435,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.6005,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.3994187417531796,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.6003,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3684169799709176,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.612,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.39500917760935933,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.6579,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.41204006075109184,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.6577,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.3798700478362599,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6043,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.38671422024474944,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.6305,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.44027807270041436,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.7012,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4621190002468634,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6052,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.3891856313705041,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.6663,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.3523057343628837,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.5846,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.41956867817442073,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6058,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.5114192626033517,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.7443,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.43309849127406347,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.7187,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.41631454039150007,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6188,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.47788207709264857,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.7718,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.3922934070751977,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.5966,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.408553915368954,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.6195,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.4285867334060612,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.665,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.3859374780012439,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.6101,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3692533490464752,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6655,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.5004821979189422,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6644,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.3678805736593292,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.5579,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.4232198324354022,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6853,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.3999592803772727,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.6828,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.393928049506523,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.6094,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4534043147000788,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6925,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.4238057057903342,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.6005,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.6057842523033262,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.8187,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.35961587811473195,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6429,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.40056045994994294,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.6963,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.4534110862461603,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.6415,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3829854967675032,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.5945,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.39346706379921986,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.623,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.3275204922619166,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.6086,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.3641486535465027,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6114,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.42017073449067316,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.6276,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.38574038783865483,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.6052,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.384585616150103,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.5766,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.40568424059538666,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.6343,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.4034784043026029,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.6497,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.45139872187066754,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6593,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.5217615224525861,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.8341,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.39419060670680345,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.6293,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.33519106053638126,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.5672,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.4420634099553051,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.6521,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.45971888432388164,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.6387,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.40838075404353097,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6919,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.4379494947546668,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.6343,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.45690337243309503,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.6831,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4453452943975889,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6643,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.34625404756407663,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.5578,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.5162864274893599,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.63,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.3876031911363762,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.7014,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.38583678426931806,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.6304,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.44534207598066183,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.6884,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.36993466736604363,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.5922,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.37483799699892645,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.603,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.35827519311964573,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.5881,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.3550182360212827,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.6339,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.36409601536960257,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.5744,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.3762401812448924,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.6236,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.424113129594775,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.6362,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.41865579496554856,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.6299,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.390327087115672,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.59,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3943126666647335,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6667,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.39696441375321984,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.6875,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.4066980964646866,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.6633,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3730617767279351,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6188,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.3786613638622962,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.5944,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.3920315764100113,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.5792,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.3951793563281852,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6323,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.40738210139455694,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.5666,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.4327654679559294,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.6903,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3866218525119125,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6275,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.41676921894489855,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.6185,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.36819851050984187,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.6003,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4289014562150203,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6426,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.3788170827299631,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.6143,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.3865590657220277,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.6638,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.4365607959056366,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6938,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.44866874502860077,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.6426,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.4296859079499077,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.7046,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.41822921407633734,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6482,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.38099014633383893,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.6109,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.4004957870733732,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.6591,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4331026824841501,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.636,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.40954588959020344,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.6816,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.37477638008580616,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.5575,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3677464157925147,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6233,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.3830645321769917,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.6373,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.4926300639629245,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.62,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.40984080239893733,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.653,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.4534047509134345,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.6164,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.41060612583947487,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.6648,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.4140544791594831,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6679,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.44852372679056324,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.6977,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.3766048639983215,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.6351,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.35319560193664873,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.5554,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.48616138562802547,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.6901,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.3993370068830059,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.7132,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.46021025343425365,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6835,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.3492644325618928,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.5862,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.4100686532285506,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.6605,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4259304835012556,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.658,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.4250219131039423,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6988,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.4028103776943648,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.6674,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.3777656548476676,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.5894,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.42749975757907166,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.6723,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.43216473413347745,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.5931,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4902906985721607,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6397,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.3438665823882658,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.5812,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.3658476490393041,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.5835,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.3780446322788056,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.595,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.41814618617778465,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.6324,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.3901178105595818,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.6639,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.38286272643945235,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6099,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.3865628722920474,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6315,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.5419359515697314,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.5427,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3899876700160922,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6366,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.3616030779346872,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.594,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.4240669327876325,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.632,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.44957973793027783,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6851,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.3803640458713458,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.6266,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.4255318402766365,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.5948,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.5052336267324755,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.5732,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.34828984780876193,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.5604,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.346773846061091,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.6007,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3488831216331237,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.5746,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.353819959211536,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.5998,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.4182713925300156,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.6411,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.43154467378249295,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6322,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.4037799248955002,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.6531,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.45604664148272145,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.66,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.44150611513550986,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.6566,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.4490419569292603,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.6412,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.4074018171974381,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.6246,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4435900300223481,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6613,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.3600253066809195,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.5667,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.39696343675793055,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.611,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.5182802937795672,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.707,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.36385089190171926,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.6002,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.4045286031388774,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.6688,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.4468725830239745,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6733,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.37429865121265865,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.6916,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.3572136019687655,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.6035,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4486284312165138,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6861,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.3553972877760231,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.594,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.4056341604847936,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.6248,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.36558159733533085,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6066,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.4061970922594267,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.633,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.3811504643296473,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.5499,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.37384507290136076,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6049,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.3973429118899678,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.6284,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.47537615922802734,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.7432,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.4104457549264363,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.7001,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.4430266939888034,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.5947,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.3967446665343882,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.5761,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4258095715960273,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6796,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.34786398810414526,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.5575,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.4275895681198496,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.679,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.5290437124970101,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6052,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.5359355147252487,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.7417,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.37930293748445637,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.5852,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4633348667320413,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.7025,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.392749110225333,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.6089,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.41487626459388594,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.6547,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.41929688237722923,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6599,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.43257949909434595,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.6243,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.40014876474338457,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.6399,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.44297625725316786,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.6189,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.391386338820982,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6101,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.40289981420138316,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.5999,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.4726234303201714,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.6732,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.3880707836498278,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.6831,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.42009141047301407,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.5946,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.31459151848912936,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.4982,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.3808332904925995,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.6033,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.3971412010894983,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.6068,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.42138215375453464,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.5968,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.45593227137872594,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.6271,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.4370834348661708,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6853,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.40604479576739955,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6261,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.36332079540179196,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.6037,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.46083204959098173,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.636,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4331820455251136,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6008,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.39838234530173705,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6939,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.4029315687366071,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.6374,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.36723184689622324,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.5428,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.41260438226003665,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.593,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.3819459436471279,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.597,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.5147018245810597,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.7745,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.3781968630865498,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.6202,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.40083163294255647,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.6093,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4013706045863398,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6523,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.45426433863489035,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.623,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.35118711218518495,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.6,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.4139965455148424,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.6918,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.4580436016800682,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.7366,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.41031093686075293,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.6508,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.42894494322204385,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6746,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.4277586928186075,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.5941,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.3726116360265131,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.6289,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.42678020531448374,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7038,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.41457849722470047,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.5832,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.5517044690279252,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.6336,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.43149313950848506,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6666,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.3616916487578632,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.5937,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.3710368806138515,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.5398,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.3560929224051006,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.5422,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.3975214060376899,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.6349,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.39721197754283144,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.658,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.40380837462984304,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6511,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.41516731626442,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.6576,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.44443986255157986,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.597,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.46690678254721785,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6744,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.4236247311231795,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.6958,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.4218011524784495,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.6064,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.4180589874972332,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6633,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.40482637284568534,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.6008,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.4127301984330829,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.6495,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.43671439107411486,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.7455,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.37837694112342424,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.6003,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.49685366602989023,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.7369,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.43391209681434595,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6135,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.40530873199447187,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.6963,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.4132260658725821,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.6623,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.37671125150722135,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6389,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.5129910304033822,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.7407,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.4652504700471368,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.6826,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.4363947547988871,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6753,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.4218095754007212,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.6141,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.40715115291783716,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.645,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4201761417092367,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6296,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.4182657198519536,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.6972,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.4281553244352119,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.6501,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4197015546849949,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6409,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.41746548528971333,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.6475,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.4194385161714978,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.6729,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.4707694959493561,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6543,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.4540084220958516,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.6682,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.4069028356392501,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.6074,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.5188126407003398,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6645,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.39146103679729155,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.5945,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.4265384478677555,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.6236,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.4159128225073078,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6616,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.41964616900270324,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.6243,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.579955875571895,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.6825,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4478208448163976,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6358,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.4127808255305365,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.6365,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.420118472388387,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.6653,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.46142875720538856,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6428,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.4048587926915659,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.6279,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.3458469497176294,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.6202,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4076662738844198,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6159,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.38112266212449186,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.635,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.38108114881334026,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.6266,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.3657179361705824,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6321,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.37421511625080134,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.6465,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.4760479839858473,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.6844,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4045266852361633,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6967,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.3868844486384437,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.6766,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.3705547133425217,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.5429,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.3857037725600411,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6145,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.3690888658164784,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.5999,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.37791030559543104,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.6228,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.40303393262956455,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.586,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.3804108106985691,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.6194,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.37964342880483826,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.5963,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.3654631867371486,
+      "learning_rate": 0.0,
+      "loss": 0.6098,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1673623541907456.0,
+      "train_loss": 0.7114526748339335,
+      "train_runtime": 29135.8629,
+      "train_samples_per_second": 1.03,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1673623541907456.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f3d2c0fab2a37b55c7692981509f5579150eb23
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "down_proj",
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..20ec7ae0d16b0a97f45a8584a4de91d16ca081c0
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2814d25019d3e9d21a6e924c8125532b4fc62ab211a13d11fdfa494d7378f0aa
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..55e68a7295ebea3d87d13fef6bb0dc75c7d0329d
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8f3c5c81f5a6de3f134118146a9b959c53799922dce1d8318db881a072efa39
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..24fd824dc0a80d79b0c186ddaecfc78b86990c11
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/trainer_state.json
@@ -0,0 +1,6601 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9994666666666666,
+  "eval_steps": 500,
+  "global_step": 937,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 0.7150404507746235,
+      "learning_rate": 6.896551724137932e-06,
+      "loss": 1.3138,
+      "step": 1
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 0.7517230768793969,
+      "learning_rate": 1.3793103448275863e-05,
+      "loss": 1.3487,
+      "step": 2
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.6850986826401334,
+      "learning_rate": 2.0689655172413793e-05,
+      "loss": 1.2781,
+      "step": 3
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.6669116276208316,
+      "learning_rate": 2.7586206896551727e-05,
+      "loss": 1.2872,
+      "step": 4
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.5513124829727151,
+      "learning_rate": 3.4482758620689657e-05,
+      "loss": 1.1467,
+      "step": 5
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.6839012744129601,
+      "learning_rate": 4.1379310344827587e-05,
+      "loss": 1.263,
+      "step": 6
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.5916540505648946,
+      "learning_rate": 4.827586206896552e-05,
+      "loss": 1.1601,
+      "step": 7
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.5681708162575256,
+      "learning_rate": 5.517241379310345e-05,
+      "loss": 1.063,
+      "step": 8
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.9175268922331461,
+      "learning_rate": 6.206896551724138e-05,
+      "loss": 1.0006,
+      "step": 9
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.689533234355985,
+      "learning_rate": 6.896551724137931e-05,
+      "loss": 1.0061,
+      "step": 10
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.6058128511088969,
+      "learning_rate": 7.586206896551724e-05,
+      "loss": 0.9672,
+      "step": 11
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.4697515045752619,
+      "learning_rate": 8.275862068965517e-05,
+      "loss": 0.9108,
+      "step": 12
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.5136927014165928,
+      "learning_rate": 8.96551724137931e-05,
+      "loss": 1.0057,
+      "step": 13
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.4050192125844374,
+      "learning_rate": 9.655172413793105e-05,
+      "loss": 0.8527,
+      "step": 14
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.37227559692849704,
+      "learning_rate": 0.00010344827586206898,
+      "loss": 0.913,
+      "step": 15
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.38229389281594744,
+      "learning_rate": 0.0001103448275862069,
+      "loss": 0.9282,
+      "step": 16
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.3914291297898359,
+      "learning_rate": 0.00011724137931034482,
+      "loss": 0.8802,
+      "step": 17
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.36356337251375576,
+      "learning_rate": 0.00012413793103448277,
+      "loss": 0.8928,
+      "step": 18
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.40831342927814046,
+      "learning_rate": 0.00013103448275862068,
+      "loss": 0.9102,
+      "step": 19
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.3984609271059379,
+      "learning_rate": 0.00013793103448275863,
+      "loss": 0.9177,
+      "step": 20
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.3789014734433653,
+      "learning_rate": 0.00014482758620689657,
+      "loss": 0.8836,
+      "step": 21
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.36284925629079756,
+      "learning_rate": 0.00015172413793103449,
+      "loss": 0.8911,
+      "step": 22
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.3782325896198143,
+      "learning_rate": 0.00015862068965517243,
+      "loss": 0.8294,
+      "step": 23
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.35647166395970625,
+      "learning_rate": 0.00016551724137931035,
+      "loss": 0.8587,
+      "step": 24
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.38566819707562905,
+      "learning_rate": 0.00017241379310344826,
+      "loss": 0.9287,
+      "step": 25
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.34689221911048684,
+      "learning_rate": 0.0001793103448275862,
+      "loss": 0.8166,
+      "step": 26
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.35163565590631973,
+      "learning_rate": 0.00018620689655172415,
+      "loss": 0.8877,
+      "step": 27
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.3602698831545272,
+      "learning_rate": 0.0001931034482758621,
+      "loss": 0.8118,
+      "step": 28
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.4717780613184218,
+      "learning_rate": 0.0002,
+      "loss": 0.9208,
+      "step": 29
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.31657851949244914,
+      "learning_rate": 0.00019999940145388063,
+      "loss": 0.8136,
+      "step": 30
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.33567964751978563,
+      "learning_rate": 0.00019999760582268763,
+      "loss": 0.7925,
+      "step": 31
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.6991343865783637,
+      "learning_rate": 0.00019999461312791638,
+      "loss": 0.7912,
+      "step": 32
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.3324181376982427,
+      "learning_rate": 0.0001999904234053922,
+      "loss": 0.8135,
+      "step": 33
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.36123274074779893,
+      "learning_rate": 0.00019998503670526994,
+      "loss": 0.8447,
+      "step": 34
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.3332897101654105,
+      "learning_rate": 0.00019997845309203334,
+      "loss": 0.808,
+      "step": 35
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.33841161377796475,
+      "learning_rate": 0.00019997067264449433,
+      "loss": 0.8057,
+      "step": 36
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.3433711214041263,
+      "learning_rate": 0.00019996169545579207,
+      "loss": 0.8823,
+      "step": 37
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.3320327339933086,
+      "learning_rate": 0.00019995152163339178,
+      "loss": 0.8295,
+      "step": 38
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.35328695696685514,
+      "learning_rate": 0.00019994015129908346,
+      "loss": 0.8567,
+      "step": 39
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.32055484686699165,
+      "learning_rate": 0.00019992758458898055,
+      "loss": 0.8269,
+      "step": 40
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.2988079364489933,
+      "learning_rate": 0.00019991382165351814,
+      "loss": 0.8004,
+      "step": 41
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.31076214749417785,
+      "learning_rate": 0.00019989886265745128,
+      "loss": 0.7915,
+      "step": 42
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.36495247754832527,
+      "learning_rate": 0.00019988270777985292,
+      "loss": 0.905,
+      "step": 43
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.3201346595140102,
+      "learning_rate": 0.00019986535721411186,
+      "loss": 0.8098,
+      "step": 44
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.31431333652249416,
+      "learning_rate": 0.00019984681116793038,
+      "loss": 0.8142,
+      "step": 45
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.3174046054807775,
+      "learning_rate": 0.00019982706986332175,
+      "loss": 0.7825,
+      "step": 46
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.3416574223684478,
+      "learning_rate": 0.00019980613353660763,
+      "loss": 0.8033,
+      "step": 47
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.29105704043049,
+      "learning_rate": 0.00019978400243841508,
+      "loss": 0.7839,
+      "step": 48
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.344561985887665,
+      "learning_rate": 0.00019976067683367385,
+      "loss": 0.8317,
+      "step": 49
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.2971171538224461,
+      "learning_rate": 0.0001997361570016129,
+      "loss": 0.7912,
+      "step": 50
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.3367067040104621,
+      "learning_rate": 0.00019971044323575728,
+      "loss": 0.8412,
+      "step": 51
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.30951669789643976,
+      "learning_rate": 0.0001996835358439244,
+      "loss": 0.8032,
+      "step": 52
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.31671201739557314,
+      "learning_rate": 0.00019965543514822062,
+      "loss": 0.7848,
+      "step": 53
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.2962489428837048,
+      "learning_rate": 0.00019962614148503718,
+      "loss": 0.7344,
+      "step": 54
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.2945631273987315,
+      "learning_rate": 0.00019959565520504623,
+      "loss": 0.7634,
+      "step": 55
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.3133817043715632,
+      "learning_rate": 0.00019956397667319668,
+      "loss": 0.7434,
+      "step": 56
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.3106632817607619,
+      "learning_rate": 0.00019953110626870979,
+      "loss": 0.8277,
+      "step": 57
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.33440647782775346,
+      "learning_rate": 0.00019949704438507459,
+      "loss": 0.8456,
+      "step": 58
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.2859149686600027,
+      "learning_rate": 0.00019946179143004325,
+      "loss": 0.6912,
+      "step": 59
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.3293297422558948,
+      "learning_rate": 0.0001994253478256262,
+      "loss": 0.8083,
+      "step": 60
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.29984639159162674,
+      "learning_rate": 0.0001993877140080869,
+      "loss": 0.8117,
+      "step": 61
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.30330771622679087,
+      "learning_rate": 0.000199348890427937,
+      "loss": 0.757,
+      "step": 62
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.31300353466341463,
+      "learning_rate": 0.00019930887754993044,
+      "loss": 0.7748,
+      "step": 63
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.3522154834722942,
+      "learning_rate": 0.00019926767585305835,
+      "loss": 0.8176,
+      "step": 64
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.31586643449288965,
+      "learning_rate": 0.000199225285830543,
+      "loss": 0.8185,
+      "step": 65
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.2869955582760285,
+      "learning_rate": 0.00019918170798983211,
+      "loss": 0.7617,
+      "step": 66
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.35210811616977333,
+      "learning_rate": 0.00019913694285259256,
+      "loss": 0.791,
+      "step": 67
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.31706716634256427,
+      "learning_rate": 0.00019909099095470444,
+      "loss": 0.8021,
+      "step": 68
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.3339013269934293,
+      "learning_rate": 0.00019904385284625424,
+      "loss": 0.8119,
+      "step": 69
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.31877870971738437,
+      "learning_rate": 0.00019899552909152866,
+      "loss": 0.7794,
+      "step": 70
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.38541610907281726,
+      "learning_rate": 0.00019894602026900758,
+      "loss": 0.7312,
+      "step": 71
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.30837800553536693,
+      "learning_rate": 0.00019889532697135734,
+      "loss": 0.7584,
+      "step": 72
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.29549951927182116,
+      "learning_rate": 0.00019884344980542338,
+      "loss": 0.7328,
+      "step": 73
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.32164400658443787,
+      "learning_rate": 0.00019879038939222329,
+      "loss": 0.7841,
+      "step": 74
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.32324949897356936,
+      "learning_rate": 0.0001987361463669392,
+      "loss": 0.7781,
+      "step": 75
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.3208249120187674,
+      "learning_rate": 0.00019868072137891002,
+      "loss": 0.8233,
+      "step": 76
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.30597753006893685,
+      "learning_rate": 0.00019862411509162406,
+      "loss": 0.7677,
+      "step": 77
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.3369333017450298,
+      "learning_rate": 0.0001985663281827108,
+      "loss": 0.8387,
+      "step": 78
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.30688012588113905,
+      "learning_rate": 0.00019850736134393286,
+      "loss": 0.7934,
+      "step": 79
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.36117693414215873,
+      "learning_rate": 0.00019844721528117766,
+      "loss": 0.7943,
+      "step": 80
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.3145122076331781,
+      "learning_rate": 0.00019838589071444903,
+      "loss": 0.7786,
+      "step": 81
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.350600304350432,
+      "learning_rate": 0.00019832338837785863,
+      "loss": 0.8077,
+      "step": 82
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.3520435592531511,
+      "learning_rate": 0.00019825970901961705,
+      "loss": 0.8227,
+      "step": 83
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.34115733128930387,
+      "learning_rate": 0.000198194853402025,
+      "loss": 0.8598,
+      "step": 84
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.2970478438067611,
+      "learning_rate": 0.00019812882230146398,
+      "loss": 0.7553,
+      "step": 85
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.31146294972146893,
+      "learning_rate": 0.00019806161650838723,
+      "loss": 0.8243,
+      "step": 86
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.3435563021382438,
+      "learning_rate": 0.00019799323682731,
+      "loss": 0.7977,
+      "step": 87
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.35481914703830225,
+      "learning_rate": 0.00019792368407680025,
+      "loss": 0.847,
+      "step": 88
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.3255425174480584,
+      "learning_rate": 0.00019785295908946848,
+      "loss": 0.8486,
+      "step": 89
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.31142872104302455,
+      "learning_rate": 0.00019778106271195806,
+      "loss": 0.7739,
+      "step": 90
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.31099434553696265,
+      "learning_rate": 0.00019770799580493494,
+      "loss": 0.7232,
+      "step": 91
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.33390378245723135,
+      "learning_rate": 0.00019763375924307735,
+      "loss": 0.7922,
+      "step": 92
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3384282157569822,
+      "learning_rate": 0.0001975583539150655,
+      "loss": 0.7837,
+      "step": 93
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.34501336210884226,
+      "learning_rate": 0.00019748178072357065,
+      "loss": 0.8115,
+      "step": 94
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.3414289964926809,
+      "learning_rate": 0.00019740404058524457,
+      "loss": 0.7936,
+      "step": 95
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.3268309843645721,
+      "learning_rate": 0.00019732513443070836,
+      "loss": 0.7667,
+      "step": 96
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.34769874408769946,
+      "learning_rate": 0.00019724506320454153,
+      "loss": 0.8125,
+      "step": 97
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.3347227330823757,
+      "learning_rate": 0.0001971638278652705,
+      "loss": 0.7706,
+      "step": 98
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.341218992013308,
+      "learning_rate": 0.0001970814293853572,
+      "loss": 0.7264,
+      "step": 99
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.3221983151649732,
+      "learning_rate": 0.00019699786875118747,
+      "loss": 0.8102,
+      "step": 100
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.32357950261485013,
+      "learning_rate": 0.00019691314696305913,
+      "loss": 0.759,
+      "step": 101
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.3688826265112733,
+      "learning_rate": 0.00019682726503517017,
+      "loss": 0.7623,
+      "step": 102
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.3157913419017387,
+      "learning_rate": 0.00019674022399560648,
+      "loss": 0.7549,
+      "step": 103
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.3199130056921602,
+      "learning_rate": 0.00019665202488632956,
+      "loss": 0.797,
+      "step": 104
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.34271816025751367,
+      "learning_rate": 0.0001965626687631641,
+      "loss": 0.7837,
+      "step": 105
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 1.1749314330347793,
+      "learning_rate": 0.00019647215669578536,
+      "loss": 0.7909,
+      "step": 106
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.3047250811610679,
+      "learning_rate": 0.00019638048976770628,
+      "loss": 0.7369,
+      "step": 107
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.3471502915554745,
+      "learning_rate": 0.00019628766907626446,
+      "loss": 0.8222,
+      "step": 108
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.31661974043563507,
+      "learning_rate": 0.00019619369573260924,
+      "loss": 0.7579,
+      "step": 109
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.3414034529581211,
+      "learning_rate": 0.00019609857086168823,
+      "loss": 0.7917,
+      "step": 110
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.32742097941667897,
+      "learning_rate": 0.00019600229560223388,
+      "loss": 0.8256,
+      "step": 111
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.3132305449791067,
+      "learning_rate": 0.00019590487110674983,
+      "loss": 0.7544,
+      "step": 112
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.3248074990909413,
+      "learning_rate": 0.0001958062985414972,
+      "loss": 0.7896,
+      "step": 113
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.31958249201953143,
+      "learning_rate": 0.00019570657908648048,
+      "loss": 0.7124,
+      "step": 114
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.3302381937004531,
+      "learning_rate": 0.0001956057139354335,
+      "loss": 0.7946,
+      "step": 115
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.35980190664873524,
+      "learning_rate": 0.0001955037042958052,
+      "loss": 0.8117,
+      "step": 116
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.28062167044982156,
+      "learning_rate": 0.00019540055138874505,
+      "loss": 0.7172,
+      "step": 117
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.36503321665028093,
+      "learning_rate": 0.00019529625644908847,
+      "loss": 0.7969,
+      "step": 118
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.3304952954224699,
+      "learning_rate": 0.0001951908207253421,
+      "loss": 0.8017,
+      "step": 119
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3504957437371224,
+      "learning_rate": 0.00019508424547966884,
+      "loss": 0.7584,
+      "step": 120
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.3099650890513928,
+      "learning_rate": 0.00019497653198787264,
+      "loss": 0.7527,
+      "step": 121
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.32179057694112095,
+      "learning_rate": 0.00019486768153938338,
+      "loss": 0.7741,
+      "step": 122
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.30322277540797365,
+      "learning_rate": 0.0001947576954372413,
+      "loss": 0.7481,
+      "step": 123
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.2925102959156477,
+      "learning_rate": 0.00019464657499808152,
+      "loss": 0.73,
+      "step": 124
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.3370938546775735,
+      "learning_rate": 0.0001945343215521182,
+      "loss": 0.7842,
+      "step": 125
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.33903637260094704,
+      "learning_rate": 0.0001944209364431286,
+      "loss": 0.784,
+      "step": 126
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.3145777113723304,
+      "learning_rate": 0.00019430642102843707,
+      "loss": 0.7201,
+      "step": 127
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.3258583284186643,
+      "learning_rate": 0.00019419077667889872,
+      "loss": 0.7463,
+      "step": 128
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.29104955145028755,
+      "learning_rate": 0.00019407400477888315,
+      "loss": 0.7396,
+      "step": 129
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.320537361595748,
+      "learning_rate": 0.00019395610672625767,
+      "loss": 0.7645,
+      "step": 130
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.32773393358493336,
+      "learning_rate": 0.00019383708393237075,
+      "loss": 0.776,
+      "step": 131
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.30795208108296096,
+      "learning_rate": 0.00019371693782203498,
+      "loss": 0.7558,
+      "step": 132
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.31080726259715846,
+      "learning_rate": 0.00019359566983351013,
+      "loss": 0.7598,
+      "step": 133
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.30560257019675857,
+      "learning_rate": 0.0001934732814184859,
+      "loss": 0.7791,
+      "step": 134
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.30215125661680314,
+      "learning_rate": 0.00019334977404206443,
+      "loss": 0.7399,
+      "step": 135
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.31244001707200425,
+      "learning_rate": 0.00019322514918274308,
+      "loss": 0.746,
+      "step": 136
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.30641046306359077,
+      "learning_rate": 0.00019309940833239626,
+      "loss": 0.7559,
+      "step": 137
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.3077699271034951,
+      "learning_rate": 0.00019297255299625797,
+      "loss": 0.7623,
+      "step": 138
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.3006958802209959,
+      "learning_rate": 0.00019284458469290354,
+      "loss": 0.7601,
+      "step": 139
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.28871077834134756,
+      "learning_rate": 0.00019271550495423168,
+      "loss": 0.7512,
+      "step": 140
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.30479592833280356,
+      "learning_rate": 0.00019258531532544585,
+      "loss": 0.7887,
+      "step": 141
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.29438431708112245,
+      "learning_rate": 0.00019245401736503608,
+      "loss": 0.7641,
+      "step": 142
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.3698190550809041,
+      "learning_rate": 0.00019232161264475997,
+      "loss": 0.8312,
+      "step": 143
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.32113905470036225,
+      "learning_rate": 0.00019218810274962417,
+      "loss": 0.7608,
+      "step": 144
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.34743845197123385,
+      "learning_rate": 0.00019205348927786532,
+      "loss": 0.7892,
+      "step": 145
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.3140052946624941,
+      "learning_rate": 0.00019191777384093081,
+      "loss": 0.7224,
+      "step": 146
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.31360127159357665,
+      "learning_rate": 0.0001917809580634596,
+      "loss": 0.8009,
+      "step": 147
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.2964427898232695,
+      "learning_rate": 0.00019164304358326275,
+      "loss": 0.7203,
+      "step": 148
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.3235804028497726,
+      "learning_rate": 0.00019150403205130383,
+      "loss": 0.7871,
+      "step": 149
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.30056816519980334,
+      "learning_rate": 0.00019136392513167903,
+      "loss": 0.7468,
+      "step": 150
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.3002837519853485,
+      "learning_rate": 0.00019122272450159745,
+      "loss": 0.7717,
+      "step": 151
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.31696226829654534,
+      "learning_rate": 0.0001910804318513609,
+      "loss": 0.8007,
+      "step": 152
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.30048603236796223,
+      "learning_rate": 0.0001909370488843436,
+      "loss": 0.7377,
+      "step": 153
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.3178311258315863,
+      "learning_rate": 0.00019079257731697196,
+      "loss": 0.7642,
+      "step": 154
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.3607299857080969,
+      "learning_rate": 0.0001906470188787039,
+      "loss": 0.7684,
+      "step": 155
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.33000609227428035,
+      "learning_rate": 0.00019050037531200814,
+      "loss": 0.7874,
+      "step": 156
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.32170495531523496,
+      "learning_rate": 0.00019035264837234347,
+      "loss": 0.7373,
+      "step": 157
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.318770211379808,
+      "learning_rate": 0.00019020383982813765,
+      "loss": 0.7532,
+      "step": 158
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.31962795313591713,
+      "learning_rate": 0.00019005395146076616,
+      "loss": 0.7478,
+      "step": 159
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.32498100332958324,
+      "learning_rate": 0.00018990298506453104,
+      "loss": 0.7682,
+      "step": 160
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.3122681763794844,
+      "learning_rate": 0.0001897509424466393,
+      "loss": 0.7533,
+      "step": 161
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.32638067359801876,
+      "learning_rate": 0.00018959782542718128,
+      "loss": 0.792,
+      "step": 162
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.31987561741947407,
+      "learning_rate": 0.000189443635839109,
+      "loss": 0.7518,
+      "step": 163
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.30041151587312104,
+      "learning_rate": 0.00018928837552821404,
+      "loss": 0.7134,
+      "step": 164
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.2970901377274106,
+      "learning_rate": 0.0001891320463531055,
+      "loss": 0.6934,
+      "step": 165
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.32022861895985255,
+      "learning_rate": 0.00018897465018518782,
+      "loss": 0.7538,
+      "step": 166
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.30419609260958613,
+      "learning_rate": 0.0001888161889086383,
+      "loss": 0.7491,
+      "step": 167
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.31972446389836073,
+      "learning_rate": 0.00018865666442038456,
+      "loss": 0.7714,
+      "step": 168
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.3299470052921253,
+      "learning_rate": 0.00018849607863008193,
+      "loss": 0.7506,
+      "step": 169
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.28517024164539245,
+      "learning_rate": 0.0001883344334600904,
+      "loss": 0.7135,
+      "step": 170
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.3224558700250482,
+      "learning_rate": 0.00018817173084545176,
+      "loss": 0.7158,
+      "step": 171
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.30270891189616717,
+      "learning_rate": 0.0001880079727338664,
+      "loss": 0.7356,
+      "step": 172
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.3388519846376595,
+      "learning_rate": 0.00018784316108566996,
+      "loss": 0.7896,
+      "step": 173
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.2816035133503061,
+      "learning_rate": 0.00018767729787380985,
+      "loss": 0.7268,
+      "step": 174
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.2981926946358348,
+      "learning_rate": 0.00018751038508382176,
+      "loss": 0.7608,
+      "step": 175
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.3011299811184157,
+      "learning_rate": 0.00018734242471380572,
+      "loss": 0.7432,
+      "step": 176
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.32803280465133966,
+      "learning_rate": 0.00018717341877440226,
+      "loss": 0.8206,
+      "step": 177
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.313769001249352,
+      "learning_rate": 0.0001870033692887684,
+      "loss": 0.7918,
+      "step": 178
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.30339604523132696,
+      "learning_rate": 0.00018683227829255334,
+      "loss": 0.7099,
+      "step": 179
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.3006045830411854,
+      "learning_rate": 0.00018666014783387408,
+      "loss": 0.7431,
+      "step": 180
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.314679925997079,
+      "learning_rate": 0.000186486979973291,
+      "loss": 0.7509,
+      "step": 181
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.33019330905158284,
+      "learning_rate": 0.0001863127767837831,
+      "loss": 0.7525,
+      "step": 182
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.31765891033121607,
+      "learning_rate": 0.0001861375403507233,
+      "loss": 0.7908,
+      "step": 183
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.31004466226608485,
+      "learning_rate": 0.00018596127277185329,
+      "loss": 0.7735,
+      "step": 184
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.2877227986417195,
+      "learning_rate": 0.0001857839761572586,
+      "loss": 0.6979,
+      "step": 185
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.3345401794189286,
+      "learning_rate": 0.00018560565262934318,
+      "loss": 0.7938,
+      "step": 186
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.31992280535145035,
+      "learning_rate": 0.00018542630432280422,
+      "loss": 0.7865,
+      "step": 187
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.29618585685440163,
+      "learning_rate": 0.00018524593338460635,
+      "loss": 0.7257,
+      "step": 188
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.2808123672759863,
+      "learning_rate": 0.00018506454197395606,
+      "loss": 0.6761,
+      "step": 189
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.32676433205976424,
+      "learning_rate": 0.00018488213226227588,
+      "loss": 0.7334,
+      "step": 190
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.3139748760986525,
+      "learning_rate": 0.0001846987064331783,
+      "loss": 0.7282,
+      "step": 191
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.32007869336218764,
+      "learning_rate": 0.00018451426668243963,
+      "loss": 0.7419,
+      "step": 192
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.2976081340068327,
+      "learning_rate": 0.0001843288152179739,
+      "loss": 0.7606,
+      "step": 193
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.286432862770985,
+      "learning_rate": 0.00018414235425980616,
+      "loss": 0.6929,
+      "step": 194
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.2859293393462299,
+      "learning_rate": 0.00018395488604004603,
+      "loss": 0.7439,
+      "step": 195
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.3280792485884969,
+      "learning_rate": 0.00018376641280286107,
+      "loss": 0.7909,
+      "step": 196
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.2950017517796392,
+      "learning_rate": 0.00018357693680444976,
+      "loss": 0.6992,
+      "step": 197
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3010421415813137,
+      "learning_rate": 0.00018338646031301458,
+      "loss": 0.738,
+      "step": 198
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.2871613748711631,
+      "learning_rate": 0.00018319498560873476,
+      "loss": 0.7175,
+      "step": 199
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.3178582558772246,
+      "learning_rate": 0.00018300251498373923,
+      "loss": 0.7615,
+      "step": 200
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.3479160766860369,
+      "learning_rate": 0.00018280905074207884,
+      "loss": 0.7944,
+      "step": 201
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.32393029697176035,
+      "learning_rate": 0.000182614595199699,
+      "loss": 0.7462,
+      "step": 202
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.2979591990515668,
+      "learning_rate": 0.00018241915068441196,
+      "loss": 0.7283,
+      "step": 203
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.32340201194214113,
+      "learning_rate": 0.00018222271953586883,
+      "loss": 0.7416,
+      "step": 204
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.3342851972955029,
+      "learning_rate": 0.00018202530410553163,
+      "loss": 0.7808,
+      "step": 205
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.33113966231844666,
+      "learning_rate": 0.00018182690675664514,
+      "loss": 0.7259,
+      "step": 206
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.31090205478518784,
+      "learning_rate": 0.00018162752986420868,
+      "loss": 0.7502,
+      "step": 207
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.2781076391987126,
+      "learning_rate": 0.0001814271758149475,
+      "loss": 0.7376,
+      "step": 208
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.29378185169500126,
+      "learning_rate": 0.00018122584700728443,
+      "loss": 0.7587,
+      "step": 209
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.2948693355021739,
+      "learning_rate": 0.00018102354585131092,
+      "loss": 0.7272,
+      "step": 210
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.31430848427810804,
+      "learning_rate": 0.00018082027476875847,
+      "loss": 0.7399,
+      "step": 211
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.323709165255713,
+      "learning_rate": 0.00018061603619296942,
+      "loss": 0.7624,
+      "step": 212
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.2968511519545849,
+      "learning_rate": 0.0001804108325688679,
+      "loss": 0.7807,
+      "step": 213
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.2819216924645662,
+      "learning_rate": 0.00018020466635293057,
+      "loss": 0.7111,
+      "step": 214
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.3025250296590155,
+      "learning_rate": 0.0001799975400131572,
+      "loss": 0.7606,
+      "step": 215
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3061912629543229,
+      "learning_rate": 0.00017978945602904116,
+      "loss": 0.7556,
+      "step": 216
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.2827904474986111,
+      "learning_rate": 0.0001795804168915396,
+      "loss": 0.7395,
+      "step": 217
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.28857293213283614,
+      "learning_rate": 0.00017937042510304392,
+      "loss": 0.7666,
+      "step": 218
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3128780749972699,
+      "learning_rate": 0.00017915948317734942,
+      "loss": 0.7465,
+      "step": 219
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.32906725700855655,
+      "learning_rate": 0.00017894759363962554,
+      "loss": 0.7839,
+      "step": 220
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.29731516228232285,
+      "learning_rate": 0.00017873475902638553,
+      "loss": 0.7153,
+      "step": 221
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.2928230641997966,
+      "learning_rate": 0.00017852098188545602,
+      "loss": 0.7008,
+      "step": 222
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.2914718534521632,
+      "learning_rate": 0.00017830626477594654,
+      "loss": 0.7333,
+      "step": 223
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.32700146595596147,
+      "learning_rate": 0.00017809061026821896,
+      "loss": 0.7801,
+      "step": 224
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.2763731273064379,
+      "learning_rate": 0.00017787402094385666,
+      "loss": 0.685,
+      "step": 225
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.2929504436109405,
+      "learning_rate": 0.00017765649939563365,
+      "loss": 0.7488,
+      "step": 226
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.2970130447161082,
+      "learning_rate": 0.00017743804822748345,
+      "loss": 0.7703,
+      "step": 227
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.33159059092356574,
+      "learning_rate": 0.00017721867005446806,
+      "loss": 0.7384,
+      "step": 228
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.2742392922768844,
+      "learning_rate": 0.00017699836750274662,
+      "loss": 0.6779,
+      "step": 229
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.2768590001638933,
+      "learning_rate": 0.00017677714320954378,
+      "loss": 0.6825,
+      "step": 230
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.32132265567577706,
+      "learning_rate": 0.00017655499982311847,
+      "loss": 0.7196,
+      "step": 231
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.29627429210170386,
+      "learning_rate": 0.00017633194000273188,
+      "loss": 0.7544,
+      "step": 232
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.27463251072949696,
+      "learning_rate": 0.00017610796641861581,
+      "loss": 0.6652,
+      "step": 233
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.2932904007753125,
+      "learning_rate": 0.0001758830817519407,
+      "loss": 0.7397,
+      "step": 234
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.28310614670643797,
+      "learning_rate": 0.00017565728869478337,
+      "loss": 0.7179,
+      "step": 235
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.3024834857591237,
+      "learning_rate": 0.00017543058995009503,
+      "loss": 0.7529,
+      "step": 236
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.3136320435854579,
+      "learning_rate": 0.00017520298823166873,
+      "loss": 0.7364,
+      "step": 237
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.2907929595807934,
+      "learning_rate": 0.000174974486264107,
+      "loss": 0.6681,
+      "step": 238
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.3131364120157505,
+      "learning_rate": 0.00017474508678278915,
+      "loss": 0.7363,
+      "step": 239
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.29983111182956595,
+      "learning_rate": 0.00017451479253383857,
+      "loss": 0.7326,
+      "step": 240
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.3207725269511036,
+      "learning_rate": 0.00017428360627408978,
+      "loss": 0.701,
+      "step": 241
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.3861142552859371,
+      "learning_rate": 0.0001740515307710557,
+      "loss": 0.7684,
+      "step": 242
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.31487307045532403,
+      "learning_rate": 0.000173818568802894,
+      "loss": 0.7591,
+      "step": 243
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.2958529377111997,
+      "learning_rate": 0.00017358472315837447,
+      "loss": 0.7488,
+      "step": 244
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.3068972347088698,
+      "learning_rate": 0.00017334999663684504,
+      "loss": 0.8011,
+      "step": 245
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.28791528208628586,
+      "learning_rate": 0.00017311439204819874,
+      "loss": 0.7263,
+      "step": 246
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.29223396448803846,
+      "learning_rate": 0.00017287791221283984,
+      "loss": 0.7267,
+      "step": 247
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.2666690598809469,
+      "learning_rate": 0.00017264055996165007,
+      "loss": 0.6668,
+      "step": 248
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.3309840970310266,
+      "learning_rate": 0.00017240233813595478,
+      "loss": 0.7258,
+      "step": 249
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.2691754319488731,
+      "learning_rate": 0.000172163249587489,
+      "loss": 0.7061,
+      "step": 250
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.29528374904113114,
+      "learning_rate": 0.00017192329717836315,
+      "loss": 0.7466,
+      "step": 251
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.33205946639312367,
+      "learning_rate": 0.00017168248378102892,
+      "loss": 0.8233,
+      "step": 252
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.29562064965308293,
+      "learning_rate": 0.0001714408122782448,
+      "loss": 0.7121,
+      "step": 253
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.3012135977466715,
+      "learning_rate": 0.0001711982855630416,
+      "loss": 0.7437,
+      "step": 254
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.2827803225931734,
+      "learning_rate": 0.00017095490653868778,
+      "loss": 0.7008,
+      "step": 255
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.29491210506131904,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.7075,
+      "step": 256
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.3051925897943795,
+      "learning_rate": 0.000170465603226582,
+      "loss": 0.7325,
+      "step": 257
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.31857535328982267,
+      "learning_rate": 0.00017021968479624203,
+      "loss": 0.768,
+      "step": 258
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.3009770540923035,
+      "learning_rate": 0.00016997292577150528,
+      "loss": 0.7494,
+      "step": 259
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.2921214285940776,
+      "learning_rate": 0.0001697253291063049,
+      "loss": 0.7767,
+      "step": 260
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.29498942574732173,
+      "learning_rate": 0.0001694768977646013,
+      "loss": 0.726,
+      "step": 261
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.2916601404697442,
+      "learning_rate": 0.00016922763472034685,
+      "loss": 0.7162,
+      "step": 262
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.28962661000374823,
+      "learning_rate": 0.00016897754295745008,
+      "loss": 0.7053,
+      "step": 263
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.29026392914900373,
+      "learning_rate": 0.00016872662546974008,
+      "loss": 0.6746,
+      "step": 264
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.2989724726717724,
+      "learning_rate": 0.0001684748852609306,
+      "loss": 0.7048,
+      "step": 265
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.29731514656338986,
+      "learning_rate": 0.00016822232534458416,
+      "loss": 0.7135,
+      "step": 266
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3043035518302015,
+      "learning_rate": 0.00016796894874407595,
+      "loss": 0.7428,
+      "step": 267
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.29378761374146967,
+      "learning_rate": 0.00016771475849255754,
+      "loss": 0.7102,
+      "step": 268
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.29062578316610915,
+      "learning_rate": 0.0001674597576329207,
+      "loss": 0.7206,
+      "step": 269
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3093426154655176,
+      "learning_rate": 0.00016720394921776097,
+      "loss": 0.7584,
+      "step": 270
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.30174166383165807,
+      "learning_rate": 0.000166947336309341,
+      "loss": 0.7342,
+      "step": 271
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.29456769132732447,
+      "learning_rate": 0.00016668992197955398,
+      "loss": 0.7371,
+      "step": 272
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.3297911302830033,
+      "learning_rate": 0.00016643170930988698,
+      "loss": 0.7862,
+      "step": 273
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.30393752220479153,
+      "learning_rate": 0.00016617270139138371,
+      "loss": 0.7646,
+      "step": 274
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.30104010921831503,
+      "learning_rate": 0.0001659129013246079,
+      "loss": 0.7163,
+      "step": 275
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.35874325189483025,
+      "learning_rate": 0.000165652312219606,
+      "loss": 0.7971,
+      "step": 276
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.26915126189349153,
+      "learning_rate": 0.00016539093719586994,
+      "loss": 0.6894,
+      "step": 277
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.30440693909445965,
+      "learning_rate": 0.00016512877938229986,
+      "loss": 0.7389,
+      "step": 278
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.2795376731367992,
+      "learning_rate": 0.0001648658419171666,
+      "loss": 0.6617,
+      "step": 279
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.27639176627018,
+      "learning_rate": 0.00016460212794807414,
+      "loss": 0.7544,
+      "step": 280
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.28378425437363497,
+      "learning_rate": 0.00016433764063192194,
+      "loss": 0.7048,
+      "step": 281
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.28315366675197184,
+      "learning_rate": 0.00016407238313486712,
+      "loss": 0.7256,
+      "step": 282
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.29143112277983785,
+      "learning_rate": 0.0001638063586322866,
+      "loss": 0.7477,
+      "step": 283
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.3109336287354725,
+      "learning_rate": 0.0001635395703087391,
+      "loss": 0.7458,
+      "step": 284
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.30280188045828377,
+      "learning_rate": 0.00016327202135792685,
+      "loss": 0.7217,
+      "step": 285
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.2854188665742502,
+      "learning_rate": 0.00016300371498265763,
+      "loss": 0.7083,
+      "step": 286
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.33213256633408716,
+      "learning_rate": 0.00016273465439480618,
+      "loss": 0.7722,
+      "step": 287
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3178323971874437,
+      "learning_rate": 0.000162464842815276,
+      "loss": 0.7055,
+      "step": 288
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.28764951172464187,
+      "learning_rate": 0.00016219428347396053,
+      "loss": 0.7144,
+      "step": 289
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.3164517443762592,
+      "learning_rate": 0.0001619229796097046,
+      "loss": 0.7692,
+      "step": 290
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.2862059612445801,
+      "learning_rate": 0.0001616509344702658,
+      "loss": 0.695,
+      "step": 291
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.29574185361704974,
+      "learning_rate": 0.00016137815131227526,
+      "loss": 0.6988,
+      "step": 292
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.33822117083312314,
+      "learning_rate": 0.00016110463340119913,
+      "loss": 0.8098,
+      "step": 293
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.27989243149301973,
+      "learning_rate": 0.000160830384011299,
+      "loss": 0.6968,
+      "step": 294
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.29850067554920934,
+      "learning_rate": 0.00016055540642559305,
+      "loss": 0.6945,
+      "step": 295
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.2861496386709042,
+      "learning_rate": 0.00016027970393581666,
+      "loss": 0.6975,
+      "step": 296
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.2990508342435385,
+      "learning_rate": 0.00016000327984238292,
+      "loss": 0.719,
+      "step": 297
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.28785342962563926,
+      "learning_rate": 0.00015972613745434314,
+      "loss": 0.6655,
+      "step": 298
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.30001158652537224,
+      "learning_rate": 0.0001594482800893474,
+      "loss": 0.748,
+      "step": 299
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.29072964378791066,
+      "learning_rate": 0.00015916971107360461,
+      "loss": 0.7082,
+      "step": 300
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.29890261400540596,
+      "learning_rate": 0.00015889043374184286,
+      "loss": 0.7327,
+      "step": 301
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.3228171591846049,
+      "learning_rate": 0.00015861045143726946,
+      "loss": 0.7367,
+      "step": 302
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.2902748295223263,
+      "learning_rate": 0.00015832976751153078,
+      "loss": 0.7129,
+      "step": 303
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.30275292475490034,
+      "learning_rate": 0.0001580483853246723,
+      "loss": 0.7838,
+      "step": 304
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.2668890914947514,
+      "learning_rate": 0.0001577663082450984,
+      "loss": 0.675,
+      "step": 305
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3040406339357325,
+      "learning_rate": 0.00015748353964953186,
+      "loss": 0.687,
+      "step": 306
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.29342135112984513,
+      "learning_rate": 0.00015720008292297364,
+      "loss": 0.7259,
+      "step": 307
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.28860332771058517,
+      "learning_rate": 0.00015691594145866215,
+      "loss": 0.7066,
+      "step": 308
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.31672975257705877,
+      "learning_rate": 0.00015663111865803285,
+      "loss": 0.7977,
+      "step": 309
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.3171367453337259,
+      "learning_rate": 0.00015634561793067737,
+      "loss": 0.7414,
+      "step": 310
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.29429343878253417,
+      "learning_rate": 0.00015605944269430277,
+      "loss": 0.6851,
+      "step": 311
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.2833825568619266,
+      "learning_rate": 0.00015577259637469058,
+      "loss": 0.7171,
+      "step": 312
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.31890104033818384,
+      "learning_rate": 0.00015548508240565583,
+      "loss": 0.7667,
+      "step": 313
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.3244450924884595,
+      "learning_rate": 0.00015519690422900593,
+      "loss": 0.8027,
+      "step": 314
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.30332414268590085,
+      "learning_rate": 0.00015490806529449945,
+      "loss": 0.7086,
+      "step": 315
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.31232628731556034,
+      "learning_rate": 0.0001546185690598049,
+      "loss": 0.7693,
+      "step": 316
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.31348120638983434,
+      "learning_rate": 0.0001543284189904592,
+      "loss": 0.7452,
+      "step": 317
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.3118912344088803,
+      "learning_rate": 0.00015403761855982631,
+      "loss": 0.7417,
+      "step": 318
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.30450479497278976,
+      "learning_rate": 0.00015374617124905564,
+      "loss": 0.7544,
+      "step": 319
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.3239981443932927,
+      "learning_rate": 0.0001534540805470403,
+      "loss": 0.7603,
+      "step": 320
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.29261333880112783,
+      "learning_rate": 0.00015316134995037545,
+      "loss": 0.7358,
+      "step": 321
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.2885874006177945,
+      "learning_rate": 0.00015286798296331632,
+      "loss": 0.7224,
+      "step": 322
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.29543008226112166,
+      "learning_rate": 0.00015257398309773633,
+      "loss": 0.7312,
+      "step": 323
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.2957848212852695,
+      "learning_rate": 0.00015227935387308511,
+      "loss": 0.6929,
+      "step": 324
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.32926130408501947,
+      "learning_rate": 0.00015198409881634617,
+      "loss": 0.772,
+      "step": 325
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.2809669129045362,
+      "learning_rate": 0.0001516882214619949,
+      "loss": 0.6751,
+      "step": 326
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.28636906382774335,
+      "learning_rate": 0.00015139172535195617,
+      "loss": 0.6989,
+      "step": 327
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.30016437496912274,
+      "learning_rate": 0.0001510946140355619,
+      "loss": 0.7392,
+      "step": 328
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.2942959821556821,
+      "learning_rate": 0.00015079689106950854,
+      "loss": 0.6848,
+      "step": 329
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.30165997763270475,
+      "learning_rate": 0.0001504985600178147,
+      "loss": 0.7109,
+      "step": 330
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.3167869656534145,
+      "learning_rate": 0.00015019962445177819,
+      "loss": 0.7919,
+      "step": 331
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.3134628091835621,
+      "learning_rate": 0.00014990008794993345,
+      "loss": 0.7248,
+      "step": 332
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.2903622009994375,
+      "learning_rate": 0.00014959995409800873,
+      "loss": 0.736,
+      "step": 333
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.29401512481649594,
+      "learning_rate": 0.00014929922648888308,
+      "loss": 0.7384,
+      "step": 334
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.28172948649947277,
+      "learning_rate": 0.0001489979087225434,
+      "loss": 0.6986,
+      "step": 335
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.2850310534609354,
+      "learning_rate": 0.00014869600440604118,
+      "loss": 0.7252,
+      "step": 336
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.2767374226067079,
+      "learning_rate": 0.00014839351715344968,
+      "loss": 0.7054,
+      "step": 337
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.31246054503597304,
+      "learning_rate": 0.00014809045058582026,
+      "loss": 0.7466,
+      "step": 338
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.30447686790541406,
+      "learning_rate": 0.00014778680833113926,
+      "loss": 0.7072,
+      "step": 339
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.27477522093219053,
+      "learning_rate": 0.00014748259402428462,
+      "loss": 0.6963,
+      "step": 340
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.29095451951383444,
+      "learning_rate": 0.00014717781130698212,
+      "loss": 0.7421,
+      "step": 341
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.30372583676503134,
+      "learning_rate": 0.00014687246382776205,
+      "loss": 0.7089,
+      "step": 342
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.28126555993254804,
+      "learning_rate": 0.00014656655524191537,
+      "loss": 0.6845,
+      "step": 343
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.30767316808921585,
+      "learning_rate": 0.0001462600892114501,
+      "loss": 0.733,
+      "step": 344
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.31251721417449835,
+      "learning_rate": 0.00014595306940504716,
+      "loss": 0.7065,
+      "step": 345
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.3108818460327434,
+      "learning_rate": 0.00014564549949801694,
+      "loss": 0.7124,
+      "step": 346
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.282923514009023,
+      "learning_rate": 0.00014533738317225485,
+      "loss": 0.6908,
+      "step": 347
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3303366957193466,
+      "learning_rate": 0.00014502872411619757,
+      "loss": 0.78,
+      "step": 348
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.3037307159828496,
+      "learning_rate": 0.00014471952602477866,
+      "loss": 0.6914,
+      "step": 349
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.2916449803269286,
+      "learning_rate": 0.0001444097925993845,
+      "loss": 0.6869,
+      "step": 350
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.3138562296402307,
+      "learning_rate": 0.0001440995275478099,
+      "loss": 0.7332,
+      "step": 351
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.2597771712776934,
+      "learning_rate": 0.0001437887345842137,
+      "loss": 0.6605,
+      "step": 352
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.3027758447845734,
+      "learning_rate": 0.00014347741742907433,
+      "loss": 0.7191,
+      "step": 353
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.2893071571942425,
+      "learning_rate": 0.00014316557980914528,
+      "loss": 0.7116,
+      "step": 354
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.2797832212193401,
+      "learning_rate": 0.00014285322545741052,
+      "loss": 0.6991,
+      "step": 355
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.3124620780913201,
+      "learning_rate": 0.0001425403581130398,
+      "loss": 0.7154,
+      "step": 356
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.2941373724733284,
+      "learning_rate": 0.00014222698152134374,
+      "loss": 0.7211,
+      "step": 357
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.2782094160086249,
+      "learning_rate": 0.0001419130994337292,
+      "loss": 0.7214,
+      "step": 358
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.2804505006644943,
+      "learning_rate": 0.00014159871560765432,
+      "loss": 0.6604,
+      "step": 359
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.30212878218473016,
+      "learning_rate": 0.0001412838338065835,
+      "loss": 0.7252,
+      "step": 360
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.2702118918080904,
+      "learning_rate": 0.0001409684577999423,
+      "loss": 0.6071,
+      "step": 361
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.2892472891674336,
+      "learning_rate": 0.00014065259136307242,
+      "loss": 0.6771,
+      "step": 362
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.3284962152202465,
+      "learning_rate": 0.0001403362382771865,
+      "loss": 0.7286,
+      "step": 363
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.3159813553417371,
+      "learning_rate": 0.0001400194023293228,
+      "loss": 0.7421,
+      "step": 364
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.2680233418669452,
+      "learning_rate": 0.00013970208731229974,
+      "loss": 0.6407,
+      "step": 365
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3696073633336626,
+      "learning_rate": 0.00013938429702467086,
+      "loss": 0.7358,
+      "step": 366
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.286676388334349,
+      "learning_rate": 0.000139066035270679,
+      "loss": 0.7204,
+      "step": 367
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.2839122477820604,
+      "learning_rate": 0.00013874730586021093,
+      "loss": 0.7149,
+      "step": 368
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.33071584092148254,
+      "learning_rate": 0.00013842811260875168,
+      "loss": 0.7061,
+      "step": 369
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.27418658262569773,
+      "learning_rate": 0.0001381084593373389,
+      "loss": 0.6862,
+      "step": 370
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.2804854915936261,
+      "learning_rate": 0.00013778834987251707,
+      "loss": 0.7242,
+      "step": 371
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.30504085364295,
+      "learning_rate": 0.00013746778804629177,
+      "loss": 0.7233,
+      "step": 372
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.27377761971187475,
+      "learning_rate": 0.0001371467776960837,
+      "loss": 0.6684,
+      "step": 373
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.30178576642031846,
+      "learning_rate": 0.0001368253226646829,
+      "loss": 0.684,
+      "step": 374
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.29748127767091714,
+      "learning_rate": 0.00013650342680020258,
+      "loss": 0.7274,
+      "step": 375
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.29599308598306373,
+      "learning_rate": 0.00013618109395603317,
+      "loss": 0.6669,
+      "step": 376
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.29901688850136304,
+      "learning_rate": 0.0001358583279907961,
+      "loss": 0.7298,
+      "step": 377
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.2800474421972958,
+      "learning_rate": 0.0001355351327682977,
+      "loss": 0.7102,
+      "step": 378
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.292933712030632,
+      "learning_rate": 0.0001352115121574829,
+      "loss": 0.703,
+      "step": 379
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.26604413420465156,
+      "learning_rate": 0.00013488747003238892,
+      "loss": 0.6801,
+      "step": 380
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.2984747285500766,
+      "learning_rate": 0.00013456301027209882,
+      "loss": 0.7136,
+      "step": 381
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.3010120016237503,
+      "learning_rate": 0.00013423813676069534,
+      "loss": 0.7163,
+      "step": 382
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.29077295670348746,
+      "learning_rate": 0.000133912853387214,
+      "loss": 0.6896,
+      "step": 383
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.2734708039635259,
+      "learning_rate": 0.0001335871640455968,
+      "loss": 0.6581,
+      "step": 384
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.266365248452827,
+      "learning_rate": 0.00013326107263464558,
+      "loss": 0.7078,
+      "step": 385
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.3119455472011162,
+      "learning_rate": 0.00013293458305797533,
+      "loss": 0.7513,
+      "step": 386
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.31817694431513605,
+      "learning_rate": 0.0001326076992239674,
+      "loss": 0.7097,
+      "step": 387
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.2718709183308422,
+      "learning_rate": 0.00013228042504572285,
+      "loss": 0.6692,
+      "step": 388
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.2900547973410943,
+      "learning_rate": 0.00013195276444101547,
+      "loss": 0.7265,
+      "step": 389
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3022191928802937,
+      "learning_rate": 0.00013162472133224483,
+      "loss": 0.7225,
+      "step": 390
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.2822429057631901,
+      "learning_rate": 0.0001312962996463896,
+      "loss": 0.679,
+      "step": 391
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.27940934892113045,
+      "learning_rate": 0.00013096750331496033,
+      "loss": 0.6523,
+      "step": 392
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.31788555354012227,
+      "learning_rate": 0.0001306383362739523,
+      "loss": 0.7764,
+      "step": 393
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.2934418110735407,
+      "learning_rate": 0.00013030880246379866,
+      "loss": 0.7124,
+      "step": 394
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.3165821382129767,
+      "learning_rate": 0.00012997890582932303,
+      "loss": 0.7585,
+      "step": 395
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.308246917214587,
+      "learning_rate": 0.00012964865031969252,
+      "loss": 0.709,
+      "step": 396
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.3092407284421468,
+      "learning_rate": 0.0001293180398883701,
+      "loss": 0.773,
+      "step": 397
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.29363551590447556,
+      "learning_rate": 0.00012898707849306763,
+      "loss": 0.6796,
+      "step": 398
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.2998652820938399,
+      "learning_rate": 0.00012865577009569824,
+      "loss": 0.7374,
+      "step": 399
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.2783646425542507,
+      "learning_rate": 0.0001283241186623291,
+      "loss": 0.6784,
+      "step": 400
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.29705300656456096,
+      "learning_rate": 0.00012799212816313376,
+      "loss": 0.6951,
+      "step": 401
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.26911596672632715,
+      "learning_rate": 0.00012765980257234473,
+      "loss": 0.6754,
+      "step": 402
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.2711186894753625,
+      "learning_rate": 0.00012732714586820583,
+      "loss": 0.6634,
+      "step": 403
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.3062968880528903,
+      "learning_rate": 0.00012699416203292466,
+      "loss": 0.7103,
+      "step": 404
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.29724964491767264,
+      "learning_rate": 0.00012666085505262485,
+      "loss": 0.7129,
+      "step": 405
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.3276764158913423,
+      "learning_rate": 0.00012632722891729845,
+      "loss": 0.7434,
+      "step": 406
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.2760680245789975,
+      "learning_rate": 0.000125993287620758,
+      "loss": 0.6518,
+      "step": 407
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.28691338701577757,
+      "learning_rate": 0.00012565903516058882,
+      "loss": 0.6537,
+      "step": 408
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.30041569017505154,
+      "learning_rate": 0.00012532447553810126,
+      "loss": 0.7138,
+      "step": 409
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.27959002497055496,
+      "learning_rate": 0.00012498961275828247,
+      "loss": 0.6669,
+      "step": 410
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.2962430647749866,
+      "learning_rate": 0.00012465445082974886,
+      "loss": 0.7467,
+      "step": 411
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.27405141310923437,
+      "learning_rate": 0.00012431899376469784,
+      "loss": 0.7013,
+      "step": 412
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.2684139536538161,
+      "learning_rate": 0.00012398324557885994,
+      "loss": 0.6689,
+      "step": 413
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.2806857537453147,
+      "learning_rate": 0.0001236472102914506,
+      "loss": 0.6913,
+      "step": 414
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.2778467641703905,
+      "learning_rate": 0.00012331089192512218,
+      "loss": 0.68,
+      "step": 415
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.3207849562533483,
+      "learning_rate": 0.00012297429450591575,
+      "loss": 0.7562,
+      "step": 416
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3021635998972281,
+      "learning_rate": 0.00012263742206321287,
+      "loss": 0.7177,
+      "step": 417
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.3092902309614487,
+      "learning_rate": 0.00012230027862968743,
+      "loss": 0.7299,
+      "step": 418
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.3036539312661521,
+      "learning_rate": 0.00012196286824125726,
+      "loss": 0.7413,
+      "step": 419
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.31927918690705254,
+      "learning_rate": 0.000121625194937036,
+      "loss": 0.7218,
+      "step": 420
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.2884783502399563,
+      "learning_rate": 0.0001212872627592845,
+      "loss": 0.6676,
+      "step": 421
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.28669541960623074,
+      "learning_rate": 0.00012094907575336267,
+      "loss": 0.6799,
+      "step": 422
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.2704411179231478,
+      "learning_rate": 0.0001206106379676809,
+      "loss": 0.6891,
+      "step": 423
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.3254286958434096,
+      "learning_rate": 0.00012027195345365167,
+      "loss": 0.7372,
+      "step": 424
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.2731466535433553,
+      "learning_rate": 0.00011993302626564102,
+      "loss": 0.6967,
+      "step": 425
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.2807729185976604,
+      "learning_rate": 0.00011959386046091998,
+      "loss": 0.6839,
+      "step": 426
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.2787040118325204,
+      "learning_rate": 0.00011925446009961607,
+      "loss": 0.6816,
+      "step": 427
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.27789017423965373,
+      "learning_rate": 0.00011891482924466471,
+      "loss": 0.6918,
+      "step": 428
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3182433579237445,
+      "learning_rate": 0.00011857497196176049,
+      "loss": 0.6531,
+      "step": 429
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.290394658448927,
+      "learning_rate": 0.00011823489231930854,
+      "loss": 0.6952,
+      "step": 430
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.2983989954278844,
+      "learning_rate": 0.00011789459438837589,
+      "loss": 0.7416,
+      "step": 431
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.2887353500957207,
+      "learning_rate": 0.00011755408224264269,
+      "loss": 0.7064,
+      "step": 432
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.28895397897697245,
+      "learning_rate": 0.00011721335995835336,
+      "loss": 0.6949,
+      "step": 433
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.3042478870143878,
+      "learning_rate": 0.00011687243161426793,
+      "loss": 0.747,
+      "step": 434
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.299282151006539,
+      "learning_rate": 0.00011653130129161316,
+      "loss": 0.7504,
+      "step": 435
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.2949167146250877,
+      "learning_rate": 0.00011618997307403367,
+      "loss": 0.6774,
+      "step": 436
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.3030249400725251,
+      "learning_rate": 0.00011584845104754304,
+      "loss": 0.7105,
+      "step": 437
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.29589961404518833,
+      "learning_rate": 0.00011550673930047498,
+      "loss": 0.6639,
+      "step": 438
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.2685570345029132,
+      "learning_rate": 0.00011516484192343425,
+      "loss": 0.6516,
+      "step": 439
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.272942272719098,
+      "learning_rate": 0.00011482276300924782,
+      "loss": 0.6465,
+      "step": 440
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.2859207332260473,
+      "learning_rate": 0.00011448050665291587,
+      "loss": 0.6676,
+      "step": 441
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.2866673263857679,
+      "learning_rate": 0.00011413807695156262,
+      "loss": 0.6987,
+      "step": 442
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.32704950699804464,
+      "learning_rate": 0.00011379547800438747,
+      "loss": 0.6961,
+      "step": 443
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3024224748565969,
+      "learning_rate": 0.00011345271391261584,
+      "loss": 0.7044,
+      "step": 444
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.31228993211722267,
+      "learning_rate": 0.00011310978877945007,
+      "loss": 0.7366,
+      "step": 445
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.28716266186260053,
+      "learning_rate": 0.00011276670671002028,
+      "loss": 0.7087,
+      "step": 446
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.2715422982460639,
+      "learning_rate": 0.00011242347181133533,
+      "loss": 0.6649,
+      "step": 447
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.27227233928386024,
+      "learning_rate": 0.00011208008819223354,
+      "loss": 0.6582,
+      "step": 448
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.2558079132567336,
+      "learning_rate": 0.00011173655996333357,
+      "loss": 0.6352,
+      "step": 449
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.274934500937583,
+      "learning_rate": 0.00011139289123698518,
+      "loss": 0.6692,
+      "step": 450
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.28820425858138565,
+      "learning_rate": 0.00011104908612722001,
+      "loss": 0.7306,
+      "step": 451
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.2786676865531545,
+      "learning_rate": 0.00011070514874970237,
+      "loss": 0.6813,
+      "step": 452
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.28781836180871956,
+      "learning_rate": 0.00011036108322167988,
+      "loss": 0.6828,
+      "step": 453
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.2628916240402343,
+      "learning_rate": 0.00011001689366193433,
+      "loss": 0.6712,
+      "step": 454
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.3316475169969365,
+      "learning_rate": 0.00010967258419073217,
+      "loss": 0.7702,
+      "step": 455
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.29632582000702057,
+      "learning_rate": 0.00010932815892977535,
+      "loss": 0.7145,
+      "step": 456
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.3033796550777103,
+      "learning_rate": 0.00010898362200215197,
+      "loss": 0.7128,
+      "step": 457
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.2812670956871168,
+      "learning_rate": 0.00010863897753228687,
+      "loss": 0.673,
+      "step": 458
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.27284272460274583,
+      "learning_rate": 0.0001082942296458922,
+      "loss": 0.6663,
+      "step": 459
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.32150331711989816,
+      "learning_rate": 0.00010794938246991817,
+      "loss": 0.7248,
+      "step": 460
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.3124675705383241,
+      "learning_rate": 0.0001076044401325036,
+      "loss": 0.7058,
+      "step": 461
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.3056934528138313,
+      "learning_rate": 0.00010725940676292636,
+      "loss": 0.7524,
+      "step": 462
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.25821166006218316,
+      "learning_rate": 0.0001069142864915542,
+      "loss": 0.6728,
+      "step": 463
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.29667202104180407,
+      "learning_rate": 0.00010656908344979506,
+      "loss": 0.6928,
+      "step": 464
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3082632780946926,
+      "learning_rate": 0.0001062238017700478,
+      "loss": 0.7028,
+      "step": 465
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.2732022844493322,
+      "learning_rate": 0.00010587844558565261,
+      "loss": 0.6891,
+      "step": 466
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.2694717190175374,
+      "learning_rate": 0.00010553301903084157,
+      "loss": 0.6945,
+      "step": 467
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.29173066404677117,
+      "learning_rate": 0.00010518752624068911,
+      "loss": 0.6918,
+      "step": 468
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.2991188095881304,
+      "learning_rate": 0.00010484197135106263,
+      "loss": 0.7075,
+      "step": 469
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.3266209656618395,
+      "learning_rate": 0.0001044963584985729,
+      "loss": 0.6726,
+      "step": 470
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.30257739145875007,
+      "learning_rate": 0.0001041506918205246,
+      "loss": 0.7064,
+      "step": 471
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.2899570070928348,
+      "learning_rate": 0.00010380497545486663,
+      "loss": 0.6933,
+      "step": 472
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.2842242620832916,
+      "learning_rate": 0.00010345921354014279,
+      "loss": 0.6764,
+      "step": 473
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.27906256261403156,
+      "learning_rate": 0.00010311341021544218,
+      "loss": 0.6855,
+      "step": 474
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.3159988605982528,
+      "learning_rate": 0.0001027675696203495,
+      "loss": 0.6977,
+      "step": 475
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.30054675574873646,
+      "learning_rate": 0.00010242169589489568,
+      "loss": 0.6937,
+      "step": 476
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.28457048083757125,
+      "learning_rate": 0.00010207579317950827,
+      "loss": 0.6512,
+      "step": 477
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.30356306008660766,
+      "learning_rate": 0.0001017298656149618,
+      "loss": 0.7225,
+      "step": 478
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.28317457056646794,
+      "learning_rate": 0.00010138391734232832,
+      "loss": 0.6838,
+      "step": 479
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.31768004643728465,
+      "learning_rate": 0.00010103795250292778,
+      "loss": 0.7606,
+      "step": 480
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.2674934346224266,
+      "learning_rate": 0.00010069197523827833,
+      "loss": 0.6467,
+      "step": 481
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.2841931782123327,
+      "learning_rate": 0.00010034598969004705,
+      "loss": 0.6924,
+      "step": 482
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.2976191946843458,
+      "learning_rate": 0.0001,
+      "loss": 0.6877,
+      "step": 483
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.3045853152591595,
+      "learning_rate": 9.965401030995301e-05,
+      "loss": 0.7188,
+      "step": 484
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.30795860143094705,
+      "learning_rate": 9.930802476172169e-05,
+      "loss": 0.6544,
+      "step": 485
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.28724834345193667,
+      "learning_rate": 9.896204749707228e-05,
+      "loss": 0.6935,
+      "step": 486
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.33534202228153537,
+      "learning_rate": 9.861608265767167e-05,
+      "loss": 0.7443,
+      "step": 487
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.29500696251450703,
+      "learning_rate": 9.827013438503822e-05,
+      "loss": 0.6852,
+      "step": 488
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.2774665215507267,
+      "learning_rate": 9.792420682049174e-05,
+      "loss": 0.6714,
+      "step": 489
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.27879423976510714,
+      "learning_rate": 9.757830410510433e-05,
+      "loss": 0.6628,
+      "step": 490
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.24864113181101832,
+      "learning_rate": 9.723243037965056e-05,
+      "loss": 0.6392,
+      "step": 491
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.29714731482588147,
+      "learning_rate": 9.688658978455784e-05,
+      "loss": 0.6926,
+      "step": 492
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.27833391768791793,
+      "learning_rate": 9.654078645985722e-05,
+      "loss": 0.736,
+      "step": 493
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.2694171559355083,
+      "learning_rate": 9.619502454513338e-05,
+      "loss": 0.6975,
+      "step": 494
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.29810632742874227,
+      "learning_rate": 9.584930817947544e-05,
+      "loss": 0.7008,
+      "step": 495
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.2948716126782792,
+      "learning_rate": 9.550364150142713e-05,
+      "loss": 0.6776,
+      "step": 496
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.2946985777896799,
+      "learning_rate": 9.515802864893739e-05,
+      "loss": 0.6741,
+      "step": 497
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.284247571441035,
+      "learning_rate": 9.481247375931094e-05,
+      "loss": 0.7164,
+      "step": 498
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.27581045832111845,
+      "learning_rate": 9.446698096915847e-05,
+      "loss": 0.6428,
+      "step": 499
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.30837432342203425,
+      "learning_rate": 9.412155441434741e-05,
+      "loss": 0.6578,
+      "step": 500
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.2990392539662064,
+      "learning_rate": 9.377619822995219e-05,
+      "loss": 0.6824,
+      "step": 501
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.29584192832682144,
+      "learning_rate": 9.343091655020495e-05,
+      "loss": 0.6581,
+      "step": 502
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.2756642495486731,
+      "learning_rate": 9.308571350844584e-05,
+      "loss": 0.6553,
+      "step": 503
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.31625137977434126,
+      "learning_rate": 9.274059323707366e-05,
+      "loss": 0.7254,
+      "step": 504
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.2888163041939396,
+      "learning_rate": 9.239555986749645e-05,
+      "loss": 0.6902,
+      "step": 505
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.2823616564888589,
+      "learning_rate": 9.205061753008183e-05,
+      "loss": 0.6761,
+      "step": 506
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.312245375551478,
+      "learning_rate": 9.170577035410783e-05,
+      "loss": 0.7174,
+      "step": 507
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.27329110126372763,
+      "learning_rate": 9.136102246771314e-05,
+      "loss": 0.7092,
+      "step": 508
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.28289621600363557,
+      "learning_rate": 9.101637799784804e-05,
+      "loss": 0.6726,
+      "step": 509
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.30378125283881025,
+      "learning_rate": 9.06718410702247e-05,
+      "loss": 0.7135,
+      "step": 510
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.27386661465405454,
+      "learning_rate": 9.032741580926787e-05,
+      "loss": 0.6595,
+      "step": 511
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.2917119627165848,
+      "learning_rate": 8.998310633806571e-05,
+      "loss": 0.667,
+      "step": 512
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.32680184848419386,
+      "learning_rate": 8.963891677832011e-05,
+      "loss": 0.7503,
+      "step": 513
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.2887453813002438,
+      "learning_rate": 8.929485125029766e-05,
+      "loss": 0.703,
+      "step": 514
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.3123646066338202,
+      "learning_rate": 8.895091387277999e-05,
+      "loss": 0.7221,
+      "step": 515
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.3043523712481793,
+      "learning_rate": 8.860710876301484e-05,
+      "loss": 0.7392,
+      "step": 516
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.26371171892204565,
+      "learning_rate": 8.826344003666647e-05,
+      "loss": 0.6779,
+      "step": 517
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.28268771266944587,
+      "learning_rate": 8.791991180776648e-05,
+      "loss": 0.6895,
+      "step": 518
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.30496944666575654,
+      "learning_rate": 8.757652818866471e-05,
+      "loss": 0.6762,
+      "step": 519
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.27800056978244325,
+      "learning_rate": 8.723329328997973e-05,
+      "loss": 0.6773,
+      "step": 520
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.3036649295616628,
+      "learning_rate": 8.689021122054996e-05,
+      "loss": 0.695,
+      "step": 521
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.2932496959497221,
+      "learning_rate": 8.654728608738418e-05,
+      "loss": 0.6949,
+      "step": 522
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.28868433300841473,
+      "learning_rate": 8.620452199561254e-05,
+      "loss": 0.6866,
+      "step": 523
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.29142913825254585,
+      "learning_rate": 8.58619230484374e-05,
+      "loss": 0.691,
+      "step": 524
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.26612842848899165,
+      "learning_rate": 8.551949334708415e-05,
+      "loss": 0.662,
+      "step": 525
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.2815041444215433,
+      "learning_rate": 8.51772369907522e-05,
+      "loss": 0.7068,
+      "step": 526
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.32699659703784895,
+      "learning_rate": 8.483515807656576e-05,
+      "loss": 0.7203,
+      "step": 527
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.2971502635394618,
+      "learning_rate": 8.449326069952506e-05,
+      "loss": 0.6996,
+      "step": 528
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.2888587243184356,
+      "learning_rate": 8.415154895245697e-05,
+      "loss": 0.6374,
+      "step": 529
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.3902497947590015,
+      "learning_rate": 8.381002692596635e-05,
+      "loss": 0.6656,
+      "step": 530
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.307671581911795,
+      "learning_rate": 8.346869870838685e-05,
+      "loss": 0.7331,
+      "step": 531
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.2724483232898439,
+      "learning_rate": 8.312756838573208e-05,
+      "loss": 0.6452,
+      "step": 532
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.2983827706569765,
+      "learning_rate": 8.278664004164665e-05,
+      "loss": 0.6677,
+      "step": 533
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.2787531753541985,
+      "learning_rate": 8.244591775735732e-05,
+      "loss": 0.6478,
+      "step": 534
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.283768099032677,
+      "learning_rate": 8.210540561162412e-05,
+      "loss": 0.6752,
+      "step": 535
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.291717309703545,
+      "learning_rate": 8.176510768069147e-05,
+      "loss": 0.7109,
+      "step": 536
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3031358111163699,
+      "learning_rate": 8.142502803823955e-05,
+      "loss": 0.7195,
+      "step": 537
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.29478240014249535,
+      "learning_rate": 8.108517075533531e-05,
+      "loss": 0.691,
+      "step": 538
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.29709353597605226,
+      "learning_rate": 8.074553990038395e-05,
+      "loss": 0.6488,
+      "step": 539
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.29440998991459344,
+      "learning_rate": 8.040613953908005e-05,
+      "loss": 0.7157,
+      "step": 540
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.29573345625739195,
+      "learning_rate": 8.0066973734359e-05,
+      "loss": 0.6637,
+      "step": 541
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.32999656344904255,
+      "learning_rate": 7.972804654634834e-05,
+      "loss": 0.7149,
+      "step": 542
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.306402388226421,
+      "learning_rate": 7.938936203231912e-05,
+      "loss": 0.7093,
+      "step": 543
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.29023525020313806,
+      "learning_rate": 7.905092424663735e-05,
+      "loss": 0.6672,
+      "step": 544
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.2831298509229484,
+      "learning_rate": 7.871273724071553e-05,
+      "loss": 0.6462,
+      "step": 545
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.2813178654912594,
+      "learning_rate": 7.837480506296404e-05,
+      "loss": 0.6305,
+      "step": 546
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.2662660017764881,
+      "learning_rate": 7.803713175874275e-05,
+      "loss": 0.6498,
+      "step": 547
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.2817750011294188,
+      "learning_rate": 7.769972137031262e-05,
+      "loss": 0.6254,
+      "step": 548
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.29468313237397137,
+      "learning_rate": 7.736257793678714e-05,
+      "loss": 0.6975,
+      "step": 549
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.3032568151281726,
+      "learning_rate": 7.702570549408428e-05,
+      "loss": 0.67,
+      "step": 550
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.31411653264107314,
+      "learning_rate": 7.668910807487783e-05,
+      "loss": 0.7137,
+      "step": 551
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.2938706074022764,
+      "learning_rate": 7.635278970854943e-05,
+      "loss": 0.6789,
+      "step": 552
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.2623978006966498,
+      "learning_rate": 7.601675442114009e-05,
+      "loss": 0.6418,
+      "step": 553
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.3092538965585159,
+      "learning_rate": 7.568100623530217e-05,
+      "loss": 0.7048,
+      "step": 554
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.2895798939877575,
+      "learning_rate": 7.534554917025119e-05,
+      "loss": 0.656,
+      "step": 555
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.32535827755481617,
+      "learning_rate": 7.501038724171756e-05,
+      "loss": 0.7482,
+      "step": 556
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.28040339929597097,
+      "learning_rate": 7.46755244618988e-05,
+      "loss": 0.6523,
+      "step": 557
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.28481551363714974,
+      "learning_rate": 7.434096483941115e-05,
+      "loss": 0.6937,
+      "step": 558
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.27658244979840724,
+      "learning_rate": 7.400671237924202e-05,
+      "loss": 0.6632,
+      "step": 559
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.28672018705481,
+      "learning_rate": 7.367277108270156e-05,
+      "loss": 0.7122,
+      "step": 560
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3054070599002515,
+      "learning_rate": 7.333914494737514e-05,
+      "loss": 0.7462,
+      "step": 561
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.26066922267536863,
+      "learning_rate": 7.300583796707539e-05,
+      "loss": 0.6526,
+      "step": 562
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.2663942533888753,
+      "learning_rate": 7.267285413179421e-05,
+      "loss": 0.6401,
+      "step": 563
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.3074810965889515,
+      "learning_rate": 7.234019742765532e-05,
+      "loss": 0.7171,
+      "step": 564
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.32327663932539985,
+      "learning_rate": 7.200787183686625e-05,
+      "loss": 0.7023,
+      "step": 565
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.27775451757768665,
+      "learning_rate": 7.167588133767091e-05,
+      "loss": 0.6617,
+      "step": 566
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.26045639881270616,
+      "learning_rate": 7.134422990430176e-05,
+      "loss": 0.5921,
+      "step": 567
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.2760200078362994,
+      "learning_rate": 7.101292150693241e-05,
+      "loss": 0.6624,
+      "step": 568
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.284862759323287,
+      "learning_rate": 7.068196011162994e-05,
+      "loss": 0.6677,
+      "step": 569
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3102663214082485,
+      "learning_rate": 7.03513496803075e-05,
+      "loss": 0.665,
+      "step": 570
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.3204096685591165,
+      "learning_rate": 7.002109417067697e-05,
+      "loss": 0.6168,
+      "step": 571
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.2809619276552323,
+      "learning_rate": 6.969119753620135e-05,
+      "loss": 0.6771,
+      "step": 572
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.27930273371137115,
+      "learning_rate": 6.936166372604773e-05,
+      "loss": 0.6595,
+      "step": 573
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.27393534135607,
+      "learning_rate": 6.903249668503972e-05,
+      "loss": 0.6543,
+      "step": 574
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.3133322498095281,
+      "learning_rate": 6.87037003536104e-05,
+      "loss": 0.6845,
+      "step": 575
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.292942414360091,
+      "learning_rate": 6.837527866775522e-05,
+      "loss": 0.6255,
+      "step": 576
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.2800959719802186,
+      "learning_rate": 6.804723555898458e-05,
+      "loss": 0.6409,
+      "step": 577
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.31656690985698044,
+      "learning_rate": 6.771957495427716e-05,
+      "loss": 0.7182,
+      "step": 578
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.33320353446119055,
+      "learning_rate": 6.739230077603259e-05,
+      "loss": 0.7104,
+      "step": 579
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.28493767473138093,
+      "learning_rate": 6.706541694202471e-05,
+      "loss": 0.6851,
+      "step": 580
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.27565430496802135,
+      "learning_rate": 6.673892736535448e-05,
+      "loss": 0.6306,
+      "step": 581
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.26208129531868224,
+      "learning_rate": 6.641283595440323e-05,
+      "loss": 0.6525,
+      "step": 582
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.28243055346170187,
+      "learning_rate": 6.608714661278606e-05,
+      "loss": 0.6653,
+      "step": 583
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.2871697561300585,
+      "learning_rate": 6.576186323930466e-05,
+      "loss": 0.672,
+      "step": 584
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.2843702328109472,
+      "learning_rate": 6.543698972790117e-05,
+      "loss": 0.6925,
+      "step": 585
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.2841216673228526,
+      "learning_rate": 6.51125299676111e-05,
+      "loss": 0.6489,
+      "step": 586
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.30262406942134606,
+      "learning_rate": 6.478848784251713e-05,
+      "loss": 0.7315,
+      "step": 587
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.2798472247395526,
+      "learning_rate": 6.446486723170236e-05,
+      "loss": 0.6645,
+      "step": 588
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.2667725923356366,
+      "learning_rate": 6.414167200920391e-05,
+      "loss": 0.6398,
+      "step": 589
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.3901233462085959,
+      "learning_rate": 6.381890604396687e-05,
+      "loss": 0.671,
+      "step": 590
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.32802964775015336,
+      "learning_rate": 6.349657319979742e-05,
+      "loss": 0.6896,
+      "step": 591
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.29918042413694523,
+      "learning_rate": 6.317467733531712e-05,
+      "loss": 0.6683,
+      "step": 592
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.27730506676072064,
+      "learning_rate": 6.28532223039163e-05,
+      "loss": 0.671,
+      "step": 593
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.31656170023615376,
+      "learning_rate": 6.253221195370826e-05,
+      "loss": 0.6984,
+      "step": 594
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.27858978923058714,
+      "learning_rate": 6.221165012748297e-05,
+      "loss": 0.6648,
+      "step": 595
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.27716740679748453,
+      "learning_rate": 6.189154066266112e-05,
+      "loss": 0.6603,
+      "step": 596
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.32373094969876837,
+      "learning_rate": 6.157188739124834e-05,
+      "loss": 0.7869,
+      "step": 597
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.32905428766166533,
+      "learning_rate": 6.125269413978907e-05,
+      "loss": 0.6604,
+      "step": 598
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.2827813439312809,
+      "learning_rate": 6.093396472932103e-05,
+      "loss": 0.6503,
+      "step": 599
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.27990982670223624,
+      "learning_rate": 6.0615702975329194e-05,
+      "loss": 0.6586,
+      "step": 600
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.31279521216643086,
+      "learning_rate": 6.029791268770029e-05,
+      "loss": 0.6517,
+      "step": 601
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.2911609136627562,
+      "learning_rate": 5.998059767067728e-05,
+      "loss": 0.6345,
+      "step": 602
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.27966152609036526,
+      "learning_rate": 5.9663761722813495e-05,
+      "loss": 0.6227,
+      "step": 603
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.2939451053442902,
+      "learning_rate": 5.934740863692759e-05,
+      "loss": 0.6706,
+      "step": 604
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.29298604755571256,
+      "learning_rate": 5.903154220005771e-05,
+      "loss": 0.6552,
+      "step": 605
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.2946108030220765,
+      "learning_rate": 5.871616619341653e-05,
+      "loss": 0.7258,
+      "step": 606
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.2695643421152406,
+      "learning_rate": 5.840128439234571e-05,
+      "loss": 0.6397,
+      "step": 607
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.2685510979232542,
+      "learning_rate": 5.80869005662708e-05,
+      "loss": 0.6463,
+      "step": 608
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.26743508036139063,
+      "learning_rate": 5.777301847865629e-05,
+      "loss": 0.622,
+      "step": 609
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.2861940883382232,
+      "learning_rate": 5.7459641886960244e-05,
+      "loss": 0.6847,
+      "step": 610
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.28751647313232404,
+      "learning_rate": 5.714677454258947e-05,
+      "loss": 0.6553,
+      "step": 611
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.2973098924609291,
+      "learning_rate": 5.6834420190854745e-05,
+      "loss": 0.6411,
+      "step": 612
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.26866698877101813,
+      "learning_rate": 5.652258257092569e-05,
+      "loss": 0.6042,
+      "step": 613
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.27277054436317155,
+      "learning_rate": 5.621126541578632e-05,
+      "loss": 0.6305,
+      "step": 614
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.29334647465461267,
+      "learning_rate": 5.590047245219009e-05,
+      "loss": 0.6677,
+      "step": 615
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.2692181748192527,
+      "learning_rate": 5.559020740061549e-05,
+      "loss": 0.6441,
+      "step": 616
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.2875655942770087,
+      "learning_rate": 5.528047397522133e-05,
+      "loss": 0.7035,
+      "step": 617
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3196039710835889,
+      "learning_rate": 5.497127588380244e-05,
+      "loss": 0.7077,
+      "step": 618
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.25514688966767723,
+      "learning_rate": 5.4662616827745185e-05,
+      "loss": 0.6279,
+      "step": 619
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.30966996310288775,
+      "learning_rate": 5.4354500501983074e-05,
+      "loss": 0.7202,
+      "step": 620
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.2749126392137727,
+      "learning_rate": 5.404693059495285e-05,
+      "loss": 0.6205,
+      "step": 621
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.3829206361819499,
+      "learning_rate": 5.373991078854992e-05,
+      "loss": 0.6853,
+      "step": 622
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.302528210142608,
+      "learning_rate": 5.3433444758084604e-05,
+      "loss": 0.6858,
+      "step": 623
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.2770927219737785,
+      "learning_rate": 5.312753617223794e-05,
+      "loss": 0.6262,
+      "step": 624
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.29440427360769594,
+      "learning_rate": 5.282218869301788e-05,
+      "loss": 0.6997,
+      "step": 625
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.2738871723904596,
+      "learning_rate": 5.251740597571542e-05,
+      "loss": 0.6366,
+      "step": 626
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.29430307578778014,
+      "learning_rate": 5.221319166886073e-05,
+      "loss": 0.6784,
+      "step": 627
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.2931850021619955,
+      "learning_rate": 5.190954941417977e-05,
+      "loss": 0.6877,
+      "step": 628
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.2766784624087078,
+      "learning_rate": 5.160648284655032e-05,
+      "loss": 0.6297,
+      "step": 629
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.28563037511990763,
+      "learning_rate": 5.1303995593958824e-05,
+      "loss": 0.6565,
+      "step": 630
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.2571946346254562,
+      "learning_rate": 5.100209127745661e-05,
+      "loss": 0.6004,
+      "step": 631
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.335776447635849,
+      "learning_rate": 5.0700773511116906e-05,
+      "loss": 0.7161,
+      "step": 632
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.27596423062496944,
+      "learning_rate": 5.040004590199128e-05,
+      "loss": 0.6356,
+      "step": 633
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.26803594304831474,
+      "learning_rate": 5.0099912050066556e-05,
+      "loss": 0.6321,
+      "step": 634
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.2841437162450076,
+      "learning_rate": 4.9800375548221845e-05,
+      "loss": 0.6633,
+      "step": 635
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.2909509540859932,
+      "learning_rate": 4.950143998218531e-05,
+      "loss": 0.6738,
+      "step": 636
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.4839290382099412,
+      "learning_rate": 4.920310893049146e-05,
+      "loss": 0.7175,
+      "step": 637
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.28786760320456234,
+      "learning_rate": 4.89053859644381e-05,
+      "loss": 0.6489,
+      "step": 638
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3066173657523897,
+      "learning_rate": 4.860827464804383e-05,
+      "loss": 0.6622,
+      "step": 639
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.31712847314029313,
+      "learning_rate": 4.831177853800511e-05,
+      "loss": 0.7302,
+      "step": 640
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.314127065264689,
+      "learning_rate": 4.801590118365383e-05,
+      "loss": 0.6554,
+      "step": 641
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.2912563986330789,
+      "learning_rate": 4.77206461269149e-05,
+      "loss": 0.6997,
+      "step": 642
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.2723475608329597,
+      "learning_rate": 4.7426016902263636e-05,
+      "loss": 0.6366,
+      "step": 643
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.2857812171044624,
+      "learning_rate": 4.713201703668367e-05,
+      "loss": 0.6396,
+      "step": 644
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.269722291822516,
+      "learning_rate": 4.683865004962452e-05,
+      "loss": 0.6718,
+      "step": 645
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.27127114300010835,
+      "learning_rate": 4.654591945295969e-05,
+      "loss": 0.601,
+      "step": 646
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.2875125469368505,
+      "learning_rate": 4.6253828750944375e-05,
+      "loss": 0.6491,
+      "step": 647
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.2820172283412604,
+      "learning_rate": 4.596238144017369e-05,
+      "loss": 0.6605,
+      "step": 648
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.28215979461344326,
+      "learning_rate": 4.567158100954083e-05,
+      "loss": 0.6455,
+      "step": 649
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.30445950051799525,
+      "learning_rate": 4.53814309401951e-05,
+      "loss": 0.6693,
+      "step": 650
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.289393432300126,
+      "learning_rate": 4.509193470550056e-05,
+      "loss": 0.6626,
+      "step": 651
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.29159567744282455,
+      "learning_rate": 4.4803095770994106e-05,
+      "loss": 0.6772,
+      "step": 652
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.3153818045447706,
+      "learning_rate": 4.4514917594344184e-05,
+      "loss": 0.7167,
+      "step": 653
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.2900916187308584,
+      "learning_rate": 4.422740362530945e-05,
+      "loss": 0.6325,
+      "step": 654
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.35486349786173416,
+      "learning_rate": 4.3940557305697226e-05,
+      "loss": 0.6996,
+      "step": 655
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.2947295595910589,
+      "learning_rate": 4.3654382069322644e-05,
+      "loss": 0.6797,
+      "step": 656
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.29827049603167827,
+      "learning_rate": 4.3368881341967135e-05,
+      "loss": 0.6707,
+      "step": 657
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.2930065259590927,
+      "learning_rate": 4.308405854133786e-05,
+      "loss": 0.6626,
+      "step": 658
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.2684168453018188,
+      "learning_rate": 4.2799917077026394e-05,
+      "loss": 0.6452,
+      "step": 659
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.2933993105793805,
+      "learning_rate": 4.251646035046814e-05,
+      "loss": 0.6877,
+      "step": 660
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.28510513967157963,
+      "learning_rate": 4.223369175490162e-05,
+      "loss": 0.6395,
+      "step": 661
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.28430008291021985,
+      "learning_rate": 4.195161467532769e-05,
+      "loss": 0.6833,
+      "step": 662
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3098944003473299,
+      "learning_rate": 4.167023248846925e-05,
+      "loss": 0.7245,
+      "step": 663
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.26285919211526726,
+      "learning_rate": 4.138954856273054e-05,
+      "loss": 0.6256,
+      "step": 664
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.29079985687631693,
+      "learning_rate": 4.110956625815713e-05,
+      "loss": 0.6689,
+      "step": 665
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3038385015807507,
+      "learning_rate": 4.083028892639541e-05,
+      "loss": 0.687,
+      "step": 666
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.31326059865547873,
+      "learning_rate": 4.055171991065262e-05,
+      "loss": 0.6164,
+      "step": 667
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.31823161383866727,
+      "learning_rate": 4.027386254565688e-05,
+      "loss": 0.6914,
+      "step": 668
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.28351722713064437,
+      "learning_rate": 3.9996720157617094e-05,
+      "loss": 0.6716,
+      "step": 669
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.2863134817616585,
+      "learning_rate": 3.972029606418335e-05,
+      "loss": 0.66,
+      "step": 670
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.2890830404906038,
+      "learning_rate": 3.9444593574406915e-05,
+      "loss": 0.6781,
+      "step": 671
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3008629834692028,
+      "learning_rate": 3.9169615988701e-05,
+      "loss": 0.6478,
+      "step": 672
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.30229004417007266,
+      "learning_rate": 3.8895366598800896e-05,
+      "loss": 0.6962,
+      "step": 673
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.29431364789393194,
+      "learning_rate": 3.862184868772473e-05,
+      "loss": 0.6799,
+      "step": 674
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.2846214083775447,
+      "learning_rate": 3.834906552973424e-05,
+      "loss": 0.6785,
+      "step": 675
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.3012354151426349,
+      "learning_rate": 3.807702039029539e-05,
+      "loss": 0.7035,
+      "step": 676
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.3063494102115359,
+      "learning_rate": 3.780571652603949e-05,
+      "loss": 0.6569,
+      "step": 677
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.29788653775709056,
+      "learning_rate": 3.753515718472402e-05,
+      "loss": 0.6765,
+      "step": 678
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.2664276148587841,
+      "learning_rate": 3.726534560519381e-05,
+      "loss": 0.6371,
+      "step": 679
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.3041428329482516,
+      "learning_rate": 3.6996285017342406e-05,
+      "loss": 0.6453,
+      "step": 680
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.2907425254813268,
+      "learning_rate": 3.672797864207316e-05,
+      "loss": 0.6739,
+      "step": 681
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.27481374083401466,
+      "learning_rate": 3.646042969126093e-05,
+      "loss": 0.6422,
+      "step": 682
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.28869767017809994,
+      "learning_rate": 3.619364136771337e-05,
+      "loss": 0.685,
+      "step": 683
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.27629920246099,
+      "learning_rate": 3.5927616865132884e-05,
+      "loss": 0.62,
+      "step": 684
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.36573334865015195,
+      "learning_rate": 3.566235936807808e-05,
+      "loss": 0.6385,
+      "step": 685
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.3306751761302303,
+      "learning_rate": 3.539787205192586e-05,
+      "loss": 0.7023,
+      "step": 686
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.29786552166099595,
+      "learning_rate": 3.513415808283341e-05,
+      "loss": 0.6945,
+      "step": 687
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.3200615573895231,
+      "learning_rate": 3.4871220617700126e-05,
+      "loss": 0.697,
+      "step": 688
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.2917494354935264,
+      "learning_rate": 3.460906280413007e-05,
+      "loss": 0.6687,
+      "step": 689
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.2881025206406779,
+      "learning_rate": 3.4347687780394e-05,
+      "loss": 0.65,
+      "step": 690
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.277026993832784,
+      "learning_rate": 3.4087098675392104e-05,
+      "loss": 0.6642,
+      "step": 691
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.28718483143700635,
+      "learning_rate": 3.382729860861632e-05,
+      "loss": 0.6399,
+      "step": 692
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.2861967168150272,
+      "learning_rate": 3.3568290690113034e-05,
+      "loss": 0.6412,
+      "step": 693
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.28223210863198006,
+      "learning_rate": 3.331007802044601e-05,
+      "loss": 0.6248,
+      "step": 694
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.29237354581718883,
+      "learning_rate": 3.305266369065901e-05,
+      "loss": 0.6278,
+      "step": 695
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.30077367438476793,
+      "learning_rate": 3.279605078223906e-05,
+      "loss": 0.651,
+      "step": 696
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.30182926963265716,
+      "learning_rate": 3.25402423670793e-05,
+      "loss": 0.6802,
+      "step": 697
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.27123008878769184,
+      "learning_rate": 3.228524150744249e-05,
+      "loss": 0.6459,
+      "step": 698
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.31528566649855466,
+      "learning_rate": 3.2031051255924085e-05,
+      "loss": 0.7312,
+      "step": 699
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.2888019042238811,
+      "learning_rate": 3.1777674655415834e-05,
+      "loss": 0.5961,
+      "step": 700
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.28995831934582733,
+      "learning_rate": 3.1525114739069415e-05,
+      "loss": 0.6922,
+      "step": 701
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.2980666425502893,
+      "learning_rate": 3.127337453025994e-05,
+      "loss": 0.6972,
+      "step": 702
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.2940912890470747,
+      "learning_rate": 3.102245704254995e-05,
+      "loss": 0.6777,
+      "step": 703
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.27656670242701575,
+      "learning_rate": 3.077236527965318e-05,
+      "loss": 0.6446,
+      "step": 704
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.2728768920242138,
+      "learning_rate": 3.0523102235398714e-05,
+      "loss": 0.6429,
+      "step": 705
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.2651053636028773,
+      "learning_rate": 3.0274670893695147e-05,
+      "loss": 0.5969,
+      "step": 706
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.290043020958257,
+      "learning_rate": 3.002707422849472e-05,
+      "loss": 0.6808,
+      "step": 707
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.30388118204687087,
+      "learning_rate": 2.978031520375798e-05,
+      "loss": 0.6818,
+      "step": 708
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.2872867060365483,
+      "learning_rate": 2.9534396773417994e-05,
+      "loss": 0.6322,
+      "step": 709
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.3012089202756257,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.6655,
+      "step": 710
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.28988814577536387,
+      "learning_rate": 2.9045093461312258e-05,
+      "loss": 0.6522,
+      "step": 711
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.28659804356967133,
+      "learning_rate": 2.8801714436958416e-05,
+      "loss": 0.6688,
+      "step": 712
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.27869443672430017,
+      "learning_rate": 2.855918772175522e-05,
+      "loss": 0.6187,
+      "step": 713
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3218060159736909,
+      "learning_rate": 2.8317516218971073e-05,
+      "loss": 0.6971,
+      "step": 714
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.2735284606053201,
+      "learning_rate": 2.8076702821636867e-05,
+      "loss": 0.6375,
+      "step": 715
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.2890031173362515,
+      "learning_rate": 2.7836750412511016e-05,
+      "loss": 0.6506,
+      "step": 716
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.29823710588390784,
+      "learning_rate": 2.7597661864045233e-05,
+      "loss": 0.7002,
+      "step": 717
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.32196430556211336,
+      "learning_rate": 2.735944003834997e-05,
+      "loss": 0.6594,
+      "step": 718
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.2952511186657231,
+      "learning_rate": 2.7122087787160166e-05,
+      "loss": 0.6295,
+      "step": 719
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.2781234638273664,
+      "learning_rate": 2.688560795180126e-05,
+      "loss": 0.6578,
+      "step": 720
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.2908767563059737,
+      "learning_rate": 2.6650003363154963e-05,
+      "loss": 0.6637,
+      "step": 721
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.2768874140636074,
+      "learning_rate": 2.641527684162556e-05,
+      "loss": 0.6426,
+      "step": 722
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.31144606853893736,
+      "learning_rate": 2.6181431197105998e-05,
+      "loss": 0.7357,
+      "step": 723
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.982380863411313,
+      "learning_rate": 2.5948469228944318e-05,
+      "loss": 0.6872,
+      "step": 724
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.2879611855890254,
+      "learning_rate": 2.5716393725910215e-05,
+      "loss": 0.6256,
+      "step": 725
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.2974019467722814,
+      "learning_rate": 2.5485207466161466e-05,
+      "loss": 0.6188,
+      "step": 726
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.28614307588026977,
+      "learning_rate": 2.5254913217210886e-05,
+      "loss": 0.6449,
+      "step": 727
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.30905402203640764,
+      "learning_rate": 2.5025513735893014e-05,
+      "loss": 0.7016,
+      "step": 728
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.30339890688065574,
+      "learning_rate": 2.47970117683313e-05,
+      "loss": 0.6477,
+      "step": 729
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.2700728879961785,
+      "learning_rate": 2.4569410049905016e-05,
+      "loss": 0.6425,
+      "step": 730
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.28514224544409006,
+      "learning_rate": 2.434271130521666e-05,
+      "loss": 0.6494,
+      "step": 731
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.2952141878210136,
+      "learning_rate": 2.411691824805934e-05,
+      "loss": 0.6424,
+      "step": 732
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.28925194465207066,
+      "learning_rate": 2.389203358138419e-05,
+      "loss": 0.6669,
+      "step": 733
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.26771131590299224,
+      "learning_rate": 2.3668059997268144e-05,
+      "loss": 0.6254,
+      "step": 734
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.29576421959665344,
+      "learning_rate": 2.3445000176881537e-05,
+      "loss": 0.6986,
+      "step": 735
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.310596638325244,
+      "learning_rate": 2.3222856790456226e-05,
+      "loss": 0.6636,
+      "step": 736
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.30693727798424675,
+      "learning_rate": 2.3001632497253424e-05,
+      "loss": 0.6667,
+      "step": 737
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.2626316622716067,
+      "learning_rate": 2.2781329945531936e-05,
+      "loss": 0.6046,
+      "step": 738
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.28520013572402136,
+      "learning_rate": 2.2561951772516587e-05,
+      "loss": 0.607,
+      "step": 739
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.2787390836412758,
+      "learning_rate": 2.2343500604366374e-05,
+      "loss": 0.6679,
+      "step": 740
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.2675406467690718,
+      "learning_rate": 2.2125979056143364e-05,
+      "loss": 0.6282,
+      "step": 741
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.29308253389415895,
+      "learning_rate": 2.190938973178105e-05,
+      "loss": 0.6745,
+      "step": 742
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.2963475640253147,
+      "learning_rate": 2.169373522405349e-05,
+      "loss": 0.6075,
+      "step": 743
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.3029314372436365,
+      "learning_rate": 2.1479018114544026e-05,
+      "loss": 0.6816,
+      "step": 744
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.33429077648590755,
+      "learning_rate": 2.1265240973614486e-05,
+      "loss": 0.6748,
+      "step": 745
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.2914498582316359,
+      "learning_rate": 2.105240636037449e-05,
+      "loss": 0.6704,
+      "step": 746
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.2751680280645356,
+      "learning_rate": 2.0840516822650614e-05,
+      "loss": 0.6588,
+      "step": 747
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.2806967013216181,
+      "learning_rate": 2.0629574896956126e-05,
+      "loss": 0.6421,
+      "step": 748
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.2938583566093967,
+      "learning_rate": 2.0419583108460418e-05,
+      "loss": 0.6584,
+      "step": 749
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2648036676215061,
+      "learning_rate": 2.0210543970958872e-05,
+      "loss": 0.6172,
+      "step": 750
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.30923057214752575,
+      "learning_rate": 2.0002459986842825e-05,
+      "loss": 0.6791,
+      "step": 751
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.30555190499212365,
+      "learning_rate": 1.9795333647069448e-05,
+      "loss": 0.7012,
+      "step": 752
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.2892376855641532,
+      "learning_rate": 1.958916743113214e-05,
+      "loss": 0.6503,
+      "step": 753
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.3130207684063405,
+      "learning_rate": 1.93839638070306e-05,
+      "loss": 0.688,
+      "step": 754
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.3038811959433142,
+      "learning_rate": 1.9179725231241564e-05,
+      "loss": 0.6412,
+      "step": 755
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.27038678722207404,
+      "learning_rate": 1.8976454148689127e-05,
+      "loss": 0.6132,
+      "step": 756
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.32290200573388256,
+      "learning_rate": 1.877415299271561e-05,
+      "loss": 0.6892,
+      "step": 757
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.29953694143025295,
+      "learning_rate": 1.857282418505253e-05,
+      "loss": 0.5971,
+      "step": 758
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.29367145518346255,
+      "learning_rate": 1.8372470135791344e-05,
+      "loss": 0.6953,
+      "step": 759
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.35767355711505755,
+      "learning_rate": 1.8173093243354878e-05,
+      "loss": 0.704,
+      "step": 760
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.3357107557671172,
+      "learning_rate": 1.7974695894468384e-05,
+      "loss": 0.7061,
+      "step": 761
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.2668410546595407,
+      "learning_rate": 1.7777280464131197e-05,
+      "loss": 0.6159,
+      "step": 762
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.24192534198216858,
+      "learning_rate": 1.7580849315588068e-05,
+      "loss": 0.5873,
+      "step": 763
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.2690632019973026,
+      "learning_rate": 1.7385404800301007e-05,
+      "loss": 0.6209,
+      "step": 764
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.2932124349468624,
+      "learning_rate": 1.7190949257921196e-05,
+      "loss": 0.6489,
+      "step": 765
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.2973507608728614,
+      "learning_rate": 1.6997485016260793e-05,
+      "loss": 0.6502,
+      "step": 766
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.27381684125841704,
+      "learning_rate": 1.680501439126525e-05,
+      "loss": 0.6415,
+      "step": 767
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3001301983794928,
+      "learning_rate": 1.6613539686985458e-05,
+      "loss": 0.6475,
+      "step": 768
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.3189832068704782,
+      "learning_rate": 1.642306319555027e-05,
+      "loss": 0.659,
+      "step": 769
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.28189982638652405,
+      "learning_rate": 1.6233587197138968e-05,
+      "loss": 0.6468,
+      "step": 770
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.30258418883378585,
+      "learning_rate": 1.6045113959953985e-05,
+      "loss": 0.6568,
+      "step": 771
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.31503912109399296,
+      "learning_rate": 1.585764574019388e-05,
+      "loss": 0.6878,
+      "step": 772
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.2980185956983998,
+      "learning_rate": 1.5671184782026106e-05,
+      "loss": 0.6806,
+      "step": 773
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.28729609894612906,
+      "learning_rate": 1.548573331756038e-05,
+      "loss": 0.6475,
+      "step": 774
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.290456116643064,
+      "learning_rate": 1.530129356682175e-05,
+      "loss": 0.6642,
+      "step": 775
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.2676401795213975,
+      "learning_rate": 1.5117867737724134e-05,
+      "loss": 0.6322,
+      "step": 776
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.291436408710233,
+      "learning_rate": 1.4935458026043959e-05,
+      "loss": 0.6869,
+      "step": 777
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.28161522370484643,
+      "learning_rate": 1.4754066615393668e-05,
+      "loss": 0.6575,
+      "step": 778
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.25659119663294533,
+      "learning_rate": 1.457369567719581e-05,
+      "loss": 0.6293,
+      "step": 779
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2677282975104176,
+      "learning_rate": 1.4394347370656836e-05,
+      "loss": 0.6139,
+      "step": 780
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.2669011346374821,
+      "learning_rate": 1.4216023842741455e-05,
+      "loss": 0.6118,
+      "step": 781
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.29119163663377057,
+      "learning_rate": 1.4038727228146753e-05,
+      "loss": 0.6697,
+      "step": 782
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.2848690193602461,
+      "learning_rate": 1.3862459649276715e-05,
+      "loss": 0.637,
+      "step": 783
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.258917119913294,
+      "learning_rate": 1.3687223216216904e-05,
+      "loss": 0.6252,
+      "step": 784
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.28269640609184815,
+      "learning_rate": 1.3513020026709023e-05,
+      "loss": 0.6346,
+      "step": 785
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.28282334610546944,
+      "learning_rate": 1.3339852166125954e-05,
+      "loss": 0.6553,
+      "step": 786
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.28753951701300084,
+      "learning_rate": 1.3167721707446678e-05,
+      "loss": 0.6618,
+      "step": 787
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.29267116605757804,
+      "learning_rate": 1.2996630711231616e-05,
+      "loss": 0.6513,
+      "step": 788
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3007934582902463,
+      "learning_rate": 1.2826581225597767e-05,
+      "loss": 0.6325,
+      "step": 789
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.2956581913246511,
+      "learning_rate": 1.26575752861943e-05,
+      "loss": 0.6602,
+      "step": 790
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.25926169436931107,
+      "learning_rate": 1.248961491617826e-05,
+      "loss": 0.6141,
+      "step": 791
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.27382235573934904,
+      "learning_rate": 1.2322702126190156e-05,
+      "loss": 0.6434,
+      "step": 792
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.26936270513648847,
+      "learning_rate": 1.2156838914330072e-05,
+      "loss": 0.6398,
+      "step": 793
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.298932807167484,
+      "learning_rate": 1.1992027266133598e-05,
+      "loss": 0.703,
+      "step": 794
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.2837987918824917,
+      "learning_rate": 1.1828269154548244e-05,
+      "loss": 0.6735,
+      "step": 795
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.287152453838261,
+      "learning_rate": 1.1665566539909623e-05,
+      "loss": 0.6303,
+      "step": 796
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.3067661790509156,
+      "learning_rate": 1.1503921369918091e-05,
+      "loss": 0.6821,
+      "step": 797
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.33194786910939317,
+      "learning_rate": 1.1343335579615467e-05,
+      "loss": 0.7115,
+      "step": 798
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.26636863988989745,
+      "learning_rate": 1.118381109136174e-05,
+      "loss": 0.6227,
+      "step": 799
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.2607029457520605,
+      "learning_rate": 1.1025349814812224e-05,
+      "loss": 0.6156,
+      "step": 800
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.2963260952267333,
+      "learning_rate": 1.0867953646894525e-05,
+      "loss": 0.623,
+      "step": 801
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.27555470116551417,
+      "learning_rate": 1.0711624471785986e-05,
+      "loss": 0.6176,
+      "step": 802
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.2671017143664467,
+      "learning_rate": 1.055636416089102e-05,
+      "loss": 0.6119,
+      "step": 803
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.2743025463236183,
+      "learning_rate": 1.0402174572818723e-05,
+      "loss": 0.6139,
+      "step": 804
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.2901714095169638,
+      "learning_rate": 1.0249057553360742e-05,
+      "loss": 0.6618,
+      "step": 805
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.2738482219598366,
+      "learning_rate": 1.0097014935468984e-05,
+      "loss": 0.6259,
+      "step": 806
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.29202567106931127,
+      "learning_rate": 9.946048539233865e-06,
+      "loss": 0.6599,
+      "step": 807
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.3243425703816153,
+      "learning_rate": 9.796160171862367e-06,
+      "loss": 0.6358,
+      "step": 808
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.33226356431758064,
+      "learning_rate": 9.647351627656543e-06,
+      "loss": 0.6853,
+      "step": 809
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.30129256888672756,
+      "learning_rate": 9.499624687991871e-06,
+      "loss": 0.6809,
+      "step": 810
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.3234482699999095,
+      "learning_rate": 9.352981121296134e-06,
+      "loss": 0.6959,
+      "step": 811
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.3007241867654337,
+      "learning_rate": 9.207422683028066e-06,
+      "loss": 0.6534,
+      "step": 812
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.2714633750452451,
+      "learning_rate": 9.062951115656403e-06,
+      "loss": 0.6419,
+      "step": 813
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.3022747403133687,
+      "learning_rate": 8.919568148639123e-06,
+      "loss": 0.6165,
+      "step": 814
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.28264368788822275,
+      "learning_rate": 8.777275498402548e-06,
+      "loss": 0.6908,
+      "step": 815
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.2972550502683997,
+      "learning_rate": 8.636074868320987e-06,
+      "loss": 0.6598,
+      "step": 816
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.3568305968989144,
+      "learning_rate": 8.495967948696192e-06,
+      "loss": 0.7169,
+      "step": 817
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.28174549663120607,
+      "learning_rate": 8.35695641673725e-06,
+      "loss": 0.6702,
+      "step": 818
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3045692578885802,
+      "learning_rate": 8.219041936540395e-06,
+      "loss": 0.625,
+      "step": 819
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.2693662188706793,
+      "learning_rate": 8.082226159069196e-06,
+      "loss": 0.6227,
+      "step": 820
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.2880666514678028,
+      "learning_rate": 7.946510722134692e-06,
+      "loss": 0.6244,
+      "step": 821
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.2734145366439765,
+      "learning_rate": 7.811897250375833e-06,
+      "loss": 0.598,
+      "step": 822
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.28781638193912634,
+      "learning_rate": 7.678387355240057e-06,
+      "loss": 0.6503,
+      "step": 823
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.3648063431291061,
+      "learning_rate": 7.5459826349639436e-06,
+      "loss": 0.7601,
+      "step": 824
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.260198966878604,
+      "learning_rate": 7.4146846745541506e-06,
+      "loss": 0.607,
+      "step": 825
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.2940453995650147,
+      "learning_rate": 7.284495045768325e-06,
+      "loss": 0.6568,
+      "step": 826
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.30230013785921245,
+      "learning_rate": 7.155415307096458e-06,
+      "loss": 0.6672,
+      "step": 827
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3185613783366778,
+      "learning_rate": 7.027447003742071e-06,
+      "loss": 0.6819,
+      "step": 828
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.2640093202070568,
+      "learning_rate": 6.900591667603751e-06,
+      "loss": 0.6028,
+      "step": 829
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.2742802557119247,
+      "learning_rate": 6.774850817256939e-06,
+      "loss": 0.6684,
+      "step": 830
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.28670109048885123,
+      "learning_rate": 6.650225957935552e-06,
+      "loss": 0.6473,
+      "step": 831
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.5482357298278248,
+      "learning_rate": 6.5267185815141355e-06,
+      "loss": 0.5979,
+      "step": 832
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.2636099474261866,
+      "learning_rate": 6.40433016648988e-06,
+      "loss": 0.609,
+      "step": 833
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.2827836449010556,
+      "learning_rate": 6.283062177965038e-06,
+      "loss": 0.6358,
+      "step": 834
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.28553530158050655,
+      "learning_rate": 6.162916067629254e-06,
+      "loss": 0.6199,
+      "step": 835
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.29957610722095446,
+      "learning_rate": 6.043893273742329e-06,
+      "loss": 0.6825,
+      "step": 836
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.29158334974628386,
+      "learning_rate": 5.925995221116853e-06,
+      "loss": 0.6493,
+      "step": 837
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.2725657370081221,
+      "learning_rate": 5.809223321101276e-06,
+      "loss": 0.6001,
+      "step": 838
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.2937486643773653,
+      "learning_rate": 5.693578971562963e-06,
+      "loss": 0.6055,
+      "step": 839
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.29609537125577257,
+      "learning_rate": 5.5790635568714224e-06,
+      "loss": 0.6704,
+      "step": 840
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.2712092200190641,
+      "learning_rate": 5.465678447881828e-06,
+      "loss": 0.618,
+      "step": 841
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.27920206556433336,
+      "learning_rate": 5.3534250019184774e-06,
+      "loss": 0.6365,
+      "step": 842
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.2998923856232532,
+      "learning_rate": 5.242304562758704e-06,
+      "loss": 0.6897,
+      "step": 843
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.31632903764839043,
+      "learning_rate": 5.132318460616625e-06,
+      "loss": 0.6813,
+      "step": 844
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.2850338246119351,
+      "learning_rate": 5.023468012127364e-06,
+      "loss": 0.6378,
+      "step": 845
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.2759659766451079,
+      "learning_rate": 4.915754520331173e-06,
+      "loss": 0.6499,
+      "step": 846
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.27832284175686606,
+      "learning_rate": 4.8091792746578935e-06,
+      "loss": 0.6286,
+      "step": 847
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.2909582769905804,
+      "learning_rate": 4.703743550911543e-06,
+      "loss": 0.6376,
+      "step": 848
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.2946038404031169,
+      "learning_rate": 4.599448611254964e-06,
+      "loss": 0.6479,
+      "step": 849
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.2963271540545558,
+      "learning_rate": 4.496295704194819e-06,
+      "loss": 0.6479,
+      "step": 850
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.30473111778805456,
+      "learning_rate": 4.394286064566511e-06,
+      "loss": 0.6919,
+      "step": 851
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3031317457354698,
+      "learning_rate": 4.293420913519541e-06,
+      "loss": 0.6035,
+      "step": 852
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.34334348312182256,
+      "learning_rate": 4.193701458502807e-06,
+      "loss": 0.7114,
+      "step": 853
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.28957551815838123,
+      "learning_rate": 4.095128893250156e-06,
+      "loss": 0.6422,
+      "step": 854
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3065754791370688,
+      "learning_rate": 3.997704397766122e-06,
+      "loss": 0.6658,
+      "step": 855
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.3005845115860106,
+      "learning_rate": 3.901429138311763e-06,
+      "loss": 0.6942,
+      "step": 856
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.3098675540355066,
+      "learning_rate": 3.80630426739077e-06,
+      "loss": 0.6429,
+      "step": 857
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.31337222049268587,
+      "learning_rate": 3.712330923735563e-06,
+      "loss": 0.6247,
+      "step": 858
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.2505119049981171,
+      "learning_rate": 3.6195102322937545e-06,
+      "loss": 0.5928,
+      "step": 859
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.28032497510759613,
+      "learning_rate": 3.5278433042146397e-06,
+      "loss": 0.6205,
+      "step": 860
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.28657416861883417,
+      "learning_rate": 3.4373312368358944e-06,
+      "loss": 0.6377,
+      "step": 861
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.26747839757924763,
+      "learning_rate": 3.347975113670454e-06,
+      "loss": 0.5963,
+      "step": 862
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.2674669659014821,
+      "learning_rate": 3.259776004393533e-06,
+      "loss": 0.6228,
+      "step": 863
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3021573273362115,
+      "learning_rate": 3.1727349648298267e-06,
+      "loss": 0.6704,
+      "step": 864
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.2880074026527094,
+      "learning_rate": 3.086853036940862e-06,
+      "loss": 0.6196,
+      "step": 865
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.246973360426054,
+      "learning_rate": 3.0021312488125454e-06,
+      "loss": 0.5727,
+      "step": 866
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.24550040548276553,
+      "learning_rate": 2.9185706146428017e-06,
+      "loss": 0.5936,
+      "step": 867
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.2809872557601715,
+      "learning_rate": 2.836172134729509e-06,
+      "loss": 0.6311,
+      "step": 868
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.41589770308234464,
+      "learning_rate": 2.754936795458485e-06,
+      "loss": 0.6519,
+      "step": 869
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4032233034895968,
+      "learning_rate": 2.674865569291651e-06,
+      "loss": 0.6586,
+      "step": 870
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.3046052266443494,
+      "learning_rate": 2.5959594147554667e-06,
+      "loss": 0.6481,
+      "step": 871
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.2945935251929872,
+      "learning_rate": 2.5182192764293567e-06,
+      "loss": 0.6257,
+      "step": 872
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.29713554063421327,
+      "learning_rate": 2.4416460849345123e-06,
+      "loss": 0.6704,
+      "step": 873
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.28299124531891034,
+      "learning_rate": 2.366240756922644e-06,
+      "loss": 0.6435,
+      "step": 874
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.29507483432351955,
+      "learning_rate": 2.2920041950650783e-06,
+      "loss": 0.689,
+      "step": 875
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.2751642964206543,
+      "learning_rate": 2.218937288041956e-06,
+      "loss": 0.6516,
+      "step": 876
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.27890844581327046,
+      "learning_rate": 2.1470409105315283e-06,
+      "loss": 0.6187,
+      "step": 877
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.2927404678505194,
+      "learning_rate": 2.0763159231997674e-06,
+      "loss": 0.625,
+      "step": 878
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.2574656127587245,
+      "learning_rate": 2.0067631726899962e-06,
+      "loss": 0.5824,
+      "step": 879
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.32738864261467976,
+      "learning_rate": 1.938383491612794e-06,
+      "loss": 0.693,
+      "step": 880
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.2923653428142808,
+      "learning_rate": 1.8711776985360308e-06,
+      "loss": 0.6562,
+      "step": 881
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.28754266408852613,
+      "learning_rate": 1.805146597975016e-06,
+      "loss": 0.6359,
+      "step": 882
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.2793573603542659,
+      "learning_rate": 1.7402909803829525e-06,
+      "loss": 0.6263,
+      "step": 883
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.3242947248714171,
+      "learning_rate": 1.6766116221413774e-06,
+      "loss": 0.6826,
+      "step": 884
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.302229428057383,
+      "learning_rate": 1.61410928555098e-06,
+      "loss": 0.6544,
+      "step": 885
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.2947576789054036,
+      "learning_rate": 1.5527847188223644e-06,
+      "loss": 0.6433,
+      "step": 886
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.30696808944507264,
+      "learning_rate": 1.4926386560671358e-06,
+      "loss": 0.6502,
+      "step": 887
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.2950325951795179,
+      "learning_rate": 1.433671817289184e-06,
+      "loss": 0.6377,
+      "step": 888
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.29142776403284254,
+      "learning_rate": 1.3758849083759352e-06,
+      "loss": 0.6108,
+      "step": 889
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.30799952597440694,
+      "learning_rate": 1.3192786210900033e-06,
+      "loss": 0.6893,
+      "step": 890
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.2649671511757792,
+      "learning_rate": 1.2638536330608408e-06,
+      "loss": 0.5532,
+      "step": 891
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.27907304352974543,
+      "learning_rate": 1.2096106077767011e-06,
+      "loss": 0.6194,
+      "step": 892
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.28586285709978776,
+      "learning_rate": 1.1565501945766222e-06,
+      "loss": 0.6213,
+      "step": 893
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.31334672185860857,
+      "learning_rate": 1.1046730286426775e-06,
+      "loss": 0.6706,
+      "step": 894
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.27241147401690935,
+      "learning_rate": 1.053979730992416e-06,
+      "loss": 0.6247,
+      "step": 895
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.30775552519743304,
+      "learning_rate": 1.0044709084713554e-06,
+      "loss": 0.6552,
+      "step": 896
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.2756705452562053,
+      "learning_rate": 9.56147153745779e-07,
+      "loss": 0.5943,
+      "step": 897
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.27034826424595554,
+      "learning_rate": 9.090090452955835e-07,
+      "loss": 0.6033,
+      "step": 898
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.3245743296461917,
+      "learning_rate": 8.630571474074311e-07,
+      "loss": 0.7088,
+      "step": 899
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.28363528174996244,
+      "learning_rate": 8.182920101679092e-07,
+      "loss": 0.6403,
+      "step": 900
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.2894840028292518,
+      "learning_rate": 7.747141694570026e-07,
+      "loss": 0.619,
+      "step": 901
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.3054324030772581,
+      "learning_rate": 7.323241469416764e-07,
+      "loss": 0.7242,
+      "step": 902
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.2992287194918836,
+      "learning_rate": 6.911224500695702e-07,
+      "loss": 0.6688,
+      "step": 903
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.26701348964873584,
+      "learning_rate": 6.511095720630244e-07,
+      "loss": 0.6208,
+      "step": 904
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.2903174348444887,
+      "learning_rate": 6.122859919130974e-07,
+      "loss": 0.6571,
+      "step": 905
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.29368806634671657,
+      "learning_rate": 5.746521743738354e-07,
+      "loss": 0.6569,
+      "step": 906
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.267650628468648,
+      "learning_rate": 5.382085699567552e-07,
+      "loss": 0.5774,
+      "step": 907
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.27530275791049835,
+      "learning_rate": 5.029556149254266e-07,
+      "loss": 0.6016,
+      "step": 908
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.28684894549841344,
+      "learning_rate": 4.6889373129022085e-07,
+      "loss": 0.6626,
+      "step": 909
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.2858549518354566,
+      "learning_rate": 4.3602332680331425e-07,
+      "loss": 0.642,
+      "step": 910
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.31470427156630587,
+      "learning_rate": 4.0434479495378155e-07,
+      "loss": 0.6986,
+      "step": 911
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.29453088944233013,
+      "learning_rate": 3.7385851496284374e-07,
+      "loss": 0.6459,
+      "step": 912
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.2919790424173549,
+      "learning_rate": 3.445648517793942e-07,
+      "loss": 0.6313,
+      "step": 913
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.2903855362643007,
+      "learning_rate": 3.164641560756132e-07,
+      "loss": 0.6834,
+      "step": 914
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.31817553626214073,
+      "learning_rate": 2.895567642427488e-07,
+      "loss": 0.6859,
+      "step": 915
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.2939831986766623,
+      "learning_rate": 2.638429983870983e-07,
+      "loss": 0.685,
+      "step": 916
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.3222959683434266,
+      "learning_rate": 2.3932316632614416e-07,
+      "loss": 0.7028,
+      "step": 917
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.32057820854732816,
+      "learning_rate": 2.15997561584913e-07,
+      "loss": 0.6953,
+      "step": 918
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.3059488091081678,
+      "learning_rate": 1.9386646339238924e-07,
+      "loss": 0.6399,
+      "step": 919
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.2939641229503454,
+      "learning_rate": 1.7293013667825098e-07,
+      "loss": 0.6716,
+      "step": 920
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.2994233155434253,
+      "learning_rate": 1.5318883206962842e-07,
+      "loss": 0.6587,
+      "step": 921
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.33888384623850104,
+      "learning_rate": 1.3464278588815048e-07,
+      "loss": 0.6649,
+      "step": 922
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.325329202626534,
+      "learning_rate": 1.1729222014709162e-07,
+      "loss": 0.6735,
+      "step": 923
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3105290082581944,
+      "learning_rate": 1.0113734254872942e-07,
+      "loss": 0.6546,
+      "step": 924
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.3276838609071141,
+      "learning_rate": 8.617834648185774e-08,
+      "loss": 0.625,
+      "step": 925
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.2948843407042844,
+      "learning_rate": 7.241541101945526e-08,
+      "loss": 0.6566,
+      "step": 926
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3143572595824,
+      "learning_rate": 5.984870091654271e-08,
+      "loss": 0.6728,
+      "step": 927
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.29404292704417323,
+      "learning_rate": 4.847836660824001e-08,
+      "loss": 0.6596,
+      "step": 928
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.3604294327978392,
+      "learning_rate": 3.8304544207945495e-08,
+      "loss": 0.6449,
+      "step": 929
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.274139263231835,
+      "learning_rate": 2.9327355505681663e-08,
+      "loss": 0.6238,
+      "step": 930
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.27272103294667494,
+      "learning_rate": 2.1546907966685236e-08,
+      "loss": 0.64,
+      "step": 931
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.2587150141920648,
+      "learning_rate": 1.496329473008595e-08,
+      "loss": 0.6438,
+      "step": 932
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.28797703518091533,
+      "learning_rate": 9.576594607807465e-09,
+      "loss": 0.705,
+      "step": 933
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.2683094544959566,
+      "learning_rate": 5.3868720836236506e-09,
+      "loss": 0.6221,
+      "step": 934
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.27433839910302826,
+      "learning_rate": 2.3941773123814516e-09,
+      "loss": 0.6151,
+      "step": 935
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.2761447567453666,
+      "learning_rate": 5.985461193791509e-10,
+      "loss": 0.6213,
+      "step": 936
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.46724154826656267,
+      "learning_rate": 0.0,
+      "loss": 0.6136,
+      "step": 937
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "step": 937,
+      "total_flos": 2443854479097856.0,
+      "train_loss": 0.7083672744360143,
+      "train_runtime": 29188.8069,
+      "train_samples_per_second": 1.028,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 937,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2443854479097856.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..512cbdf5265a510ca580f516f4cd258cdab4fbf7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9b7cb1a32600557510d34f2f979ff99ba26a9e62
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f9535a4f6a448826a21c3a267011f64cd8747b385292cc0463e5b5fc1f24223
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..45e5ccdd133ad1d7c402b7953a51772bcee080da
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76eee359f5364c4d527493ac0546a952632f483327c3eb87a542d89502502844
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2904470624b646fb832ede5753619c202987ceca
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.7628357242415116,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.255,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 0.8194757458054642,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.2986,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.7958014474683687,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.3299,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 0.7807215158428453,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.3331,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 1.10867974246785,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.1955,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7501373704184361,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3464,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.7594642077016195,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.322,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.6424306160023425,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.1769,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.6621813680604755,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.1253,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.6072150043621733,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 1.0355,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.9512685642147044,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 1.0948,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9654646660953872,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 0.9527,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.7158956275127898,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 0.9619,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.7510755826538486,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 0.9568,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.6761493950332309,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.9471,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.6442709212470771,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 0.9905,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.6140912856378853,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 0.8823,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.5927185337725561,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.9203,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.5885011384967356,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.8946,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.569978601461122,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.9413,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5450626692942131,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.8821,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.5869407362791327,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.9553,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.5514964151706281,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.8503,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.4576203847614647,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.8835,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.4656587189395512,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 0.8603,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.6353641328243992,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.9426,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.4911824668625709,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.887,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.5022750080958399,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.8704,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.5488540150230059,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.9003,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.49086221281679715,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8717,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.4519977501834657,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.8111,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.5667972847554228,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.9719,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5243396224563087,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8918,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.4206257007883853,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 0.8267,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.4408623734911278,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.8421,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.4310398203704329,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8038,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.41152712397478197,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.7113,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.4698983552275035,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8961,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.4496152385894507,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.8877,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.5243923818385874,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.8824,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.4911900741487491,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.9176,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.4856857712861841,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.7998,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.49859929276212756,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.8706,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.4377547229110765,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.8236,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6531009589606827,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8414,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.5033954891454102,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.8178,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.44798636015217747,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 0.827,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.45582754630713557,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8612,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.45872918777303245,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.904,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.5931742003230699,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.8941,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.46435386514124116,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8459,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.4103906134060393,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 0.802,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.4527873188642011,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.8612,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5377033391169986,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.8861,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.44354325477395923,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.8266,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.46965440945573167,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.7932,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.49861368848703796,
+      "learning_rate": 0.0002,
+      "loss": 0.9251,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.5358886796410643,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.8537,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.4199111933454957,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.8055,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4440651139054541,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8195,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.38496763848220006,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.7665,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.4601442010805776,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.7948,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.4538395973660438,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8292,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.4103974919312309,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.8019,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.5126054306975563,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.7964,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.42358893462331326,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8085,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.4935773347491403,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.8605,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.5105035840801055,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.8788,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.44823839807865595,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.8743,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.4644685890239903,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.8567,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.471562980689817,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.8949,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.41416387277398714,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.8016,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.4067330878703305,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.7976,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.4680547465198519,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.8615,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.466684378170647,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8326,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.5843105014034774,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.9223,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.4516958986843274,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8231,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.46416836784344917,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.8305,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.3697772206556224,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.699,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.46254325283397635,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.8587,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.41761565245303794,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.7599,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.43848110998457335,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.8359,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.4720306109707181,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 0.8616,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.9634253575197086,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.7746,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.5086670772892667,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.8447,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.4851308210699472,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.7816,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.43399889675256137,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.8149,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.45522208083620896,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.8185,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.41206090460997447,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.7626,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.3927806603634916,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.7618,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.445857279114616,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.7935,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.4019642963276086,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.7742,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.45097234549832604,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.8406,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.4527733588093272,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.8657,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.43058663300109606,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.7905,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.4223369658847847,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7845,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.455684068704479,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 0.8505,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.4519630049361916,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.7986,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.4351535348540937,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8325,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.43908220372345264,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.81,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.4476311325416776,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.7769,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4726063952110799,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8528,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.4201995743808383,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.813,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.4148956484560226,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.7447,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.5015600684967997,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.9028,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.4310765367841715,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.7803,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.44752749930977775,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.8099,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.40063612430475026,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.7469,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.43486569204981024,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.8109,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.4396585907015772,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.8279,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.390314497464367,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.7306,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.42267774616233733,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.7179,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.45427963486348716,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 0.8086,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4527011734468865,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.8277,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.49169284050246814,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.8489,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.4279168406154108,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.7653,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4121276367654202,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7742,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.4536726030404313,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.8194,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.4486295176595619,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.8214,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.42898423421445664,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.7394,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.45190207354720013,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.8329,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.39971546567051797,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.7784,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.46504120999614906,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.824,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.40648705882743386,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 0.7704,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.462891583179439,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.8047,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.44835435341660984,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.8025,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.47607297228300677,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.8257,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.5193575319855029,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.9143,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4766643858963425,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8641,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.49597963705967496,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.8312,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.4808812880130579,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.7986,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.44599017255723294,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8068,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.48750979954844126,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.8945,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.6254679334048621,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.7425,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.43829115222454273,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.7859,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.4704662955531175,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.7688,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.4409529564290118,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.8644,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.4344184221831154,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7998,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.4046224512304256,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.7475,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.45801000153339994,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.8505,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.4141957413731039,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7275,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.4314403142961352,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.8589,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.4385517025195832,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.8784,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4031136160078963,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.7516,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.43675739098920885,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.7659,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.49445359229428,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.7858,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.43646456852784826,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.7629,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.42977797610808016,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.7425,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.4920222624489503,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.8062,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4479600663744989,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.8066,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.41424496856493076,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.7106,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.4880984065020837,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.9123,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.43933091597767177,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.784,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.4451132220141123,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.8551,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.4483482119194952,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.8209,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.41615558492968413,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.813,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.39848951490249424,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.6954,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.4241471112610877,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.78,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.4242752278974099,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7545,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.44284383218892404,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.8226,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.4475246490821707,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.7354,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5187652451271928,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.886,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.4805342879602958,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.8325,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.4844268011021575,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.8778,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.43007715527336304,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8081,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.43584800742577234,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.7957,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.49337379328966763,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.8278,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4714725906519646,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8136,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.4584132090446889,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 0.8557,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.4817876126266651,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.8136,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4483299087194101,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8202,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.4859168307388182,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.8224,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.39941565954850006,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.7702,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4611323663241478,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.828,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.4482217525906128,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.7599,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.44027942062871145,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.8377,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4659582801677607,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.9079,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.4505204473100588,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 0.8439,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.45960903450500473,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8309,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5377935625560539,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.8042,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.41565057105304903,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.8002,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.41460320605279716,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.7237,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.49178728981780256,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8454,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.44755309029442725,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.8503,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.38714457804353636,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.7342,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4760548365486237,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.835,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.43674343320545966,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.8249,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.4531365469989709,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.8027,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.43823941560176616,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.7959,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.3929592720994812,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.6907,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.39972921626850777,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.814,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.43710404147202,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.8194,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.4342108444860517,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.812,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.40976023474313855,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.7932,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.42537748527448105,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.8032,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.42091074697602304,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.7582,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.3966725914002177,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.7514,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.46048178101311826,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8245,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.46331211907377207,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.8152,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.472286790301323,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.8484,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.4664840395567395,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.8278,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.44170299279219305,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.8216,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.43538865688890044,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.8423,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.43026680130665573,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7704,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.4402184198715376,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.8485,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.4597257248557568,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.7699,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.43125968937925885,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.8024,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.45240591752822273,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.8629,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.49186335474232923,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.8732,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.45901536135888155,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.8123,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.4596031446817396,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.8852,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.46618706944182065,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.7485,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.6902121551823582,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7188,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.4083684587846421,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.7432,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.4961646801394343,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.8278,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4400375530479702,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.8287,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.4190005436683279,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.8171,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.4884107398069035,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.8943,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.41500498773944605,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.7398,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.41954265563184173,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.7739,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.4513842667939194,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.7367,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5733483427976973,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8087,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.43191274270691143,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.7329,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.43845824001522565,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.7939,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4464057108322274,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.8048,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.46481050734827417,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.7878,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.43270851550710027,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.778,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4367449983041571,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7888,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.40036781719061393,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.7315,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.44202834492107124,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.8316,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.44133307517411113,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.772,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.45447721349442555,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.8157,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.38817072137937025,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.7261,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4562599393937972,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7925,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.4451932832529039,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.7445,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.4425698163551616,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.8399,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.4044960659500061,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7836,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.4454396806934391,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.8236,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.40740737398119914,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.746,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4073032195596395,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8102,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.4149416935466569,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.7821,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.39364797701196813,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.753,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.3705972966731362,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.7606,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.4591223904326906,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.8666,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.3908139311476786,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.7006,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.38215926157824187,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7295,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.41336248255289815,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.7479,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.4259106348010246,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.7163,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.45386495326525006,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8195,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.4390682586736766,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.8298,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.47482501127831234,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.8094,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.4258694435585488,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8671,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.4042887335019348,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.7795,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.4060175563463174,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.7552,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.42325218335845294,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.8127,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.42025220582760886,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 0.7834,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.3899607546060976,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7461,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.44293320632178584,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.8554,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.38414290035390497,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.7265,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.39697025700360344,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.7204,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.35223629771328,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.719,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.43080921604558886,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.8576,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.41838324723144754,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.7771,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4293837353499623,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8393,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.4170972005584217,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.7614,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.4003287638349002,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 0.8092,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.3791029707393735,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7869,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.4461110806714854,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.8196,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.4032553825749664,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.8043,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4489374564257814,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.7999,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.3931813135152387,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 0.7548,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.43213500052159126,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.7404,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.463999438603729,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7251,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.4202383869284423,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.7599,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.4161050003557798,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 0.7888,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4580749494162176,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.7753,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.4500640596888444,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.8523,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.4417347445263267,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.7784,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.4191198632455908,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.8291,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.40465574964154416,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.7461,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.38044221580169696,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.8062,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.41989426187836015,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.7404,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.4174598718201883,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.8005,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.39746626227909265,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.77,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.49211859112155604,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.9203,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.44548839442310045,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.8274,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.4206063627886071,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.7462,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.38061066166890833,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.7482,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.46789180687659593,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.8544,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.459318329460134,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.77,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.3964244032086574,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.6858,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.4021983466035418,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.7392,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.4463722717797184,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.814,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.40817539027781785,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.7085,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.44904259139519614,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.7432,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.38360848103639306,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.7165,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.443531638191723,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8256,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.38095191702642245,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.7611,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.3945116950378279,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.779,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.397456986825466,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8102,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.6803448460607161,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 0.8261,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.40842654911594944,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.7522,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.5638276816528227,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.9793,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.36684193003955196,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.7141,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.4537436788475935,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.8118,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4062097659957066,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.7537,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.4514769645704028,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.7657,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.44614717708012835,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.8617,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.41963744513674606,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7844,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.42682085703394257,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.7684,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.39937427047954077,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.7551,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.43952243579704303,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8068,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.5151466577662714,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.727,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.46431394111704427,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.8529,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.4086453319288397,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7239,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.47172791859158825,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.8012,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.4939060978903562,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.8453,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.42505393841937944,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.8035,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.40056450114313424,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7788,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.43859811381683717,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.7728,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.38868079227357577,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7384,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.4169781202695756,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.7412,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.4758209190095243,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.7737,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.44608960722227414,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7766,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.4384190504043158,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.8301,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.4664490211064372,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.8381,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.37886426668765194,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.7254,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.46384866324389645,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.8627,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.44489240090318244,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.7049,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.40644703358089984,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7327,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.42800900543499615,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.797,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.42280954690894723,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.7859,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.427761653048338,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7594,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.3892262470310244,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.7503,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.4207635391805858,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.7673,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.43435097159311664,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8014,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.40937023088469,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.7859,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.41001083482566403,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.7813,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.3812789606403339,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.7912,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.38260579878749484,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.7276,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.4188375153138234,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.7856,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4191073265804016,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.8182,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.4409245526215755,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.8265,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.4875836047329921,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.8378,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.43206958362534076,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7166,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.45230825476446884,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.8139,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.4403885177275518,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.7746,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.36705477948609244,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.6628,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.4309350434473072,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.7983,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.4561987670471556,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.8956,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.4506239667312874,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.79,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.41185547045134,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.8051,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.4608890231919324,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.8128,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.40548159698715563,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7737,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.46337948526039613,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.8235,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.4474937103830373,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.7144,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.42890106376323245,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7508,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.47516968882081667,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.7866,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.44556770166104803,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.802,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4109752045262668,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7588,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.4374877722878183,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.7432,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.45922625029093916,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.7508,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4433410526588002,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.8436,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.45662892382022074,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.8183,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.48004506628680943,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.856,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.413905602909353,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.755,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.44171838117859225,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.7949,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.4121760243310849,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.7873,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.4175764433109541,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.7996,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.36921453998206777,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.6906,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.42607268744743987,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.8004,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4485820639155176,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8138,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.4233782391868254,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.8149,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.4019541542751352,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.746,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.44813233661492907,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8055,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.40332082604890734,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.6846,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.3894478056645389,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.7378,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.442120712309075,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.8193,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.36333630977196085,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.6819,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.4354306266272364,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.6946,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.40743520887541135,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7572,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.42090155183192574,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.7501,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.38910082938456136,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.7161,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4029892976127122,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7463,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.4276425202170697,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.7936,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.46679796325534834,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.8171,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4478670167653976,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7552,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.42859027674050987,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.7506,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.39325455786258395,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.7574,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.3792874715373212,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.6801,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.43147352714426557,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.8133,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.44500228600342345,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.7759,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.40390199826261264,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7525,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.3972330956103031,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.7708,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.4143928993768653,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.767,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.43262304517242256,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.8193,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.40219774460267915,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.7255,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.43205612834254487,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.7399,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.4598219347282703,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.7702,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.39448665452304327,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.7211,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.4206393615135302,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.7889,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4846467223856484,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.8837,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.41667485615355143,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.7189,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.4604796342144686,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.7937,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.45259207316683736,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7974,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.4341042311882465,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.738,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.4683448540774323,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.7193,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.41408110761574435,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7251,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.4542005341965333,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.7984,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.3939031674123216,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.7133,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.4819986008493197,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.8371,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.3960782226169457,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.7093,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.4625689307705016,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.8074,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.42313403473960337,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.7474,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.4270307709479224,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.7943,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.4220964234581579,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.8479,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4055106834380055,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.7673,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.39707451997168985,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.7355,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.41645880432509597,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.7881,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.40219537606475975,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.759,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.40028445203067436,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.758,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.45868882119219095,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.7778,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.5055865468343858,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.8473,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.4260233233671494,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.7326,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.4508177776857487,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.7635,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.5457963141188263,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.8356,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.36218033840482383,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.6857,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.4265971390621618,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.7591,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.44138850691136283,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7895,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.4253692589500084,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.7274,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.4172888167913838,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.7607,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.43693502598088874,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7521,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.3504487378185642,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.6614,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.43630167985015783,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.8651,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.3823016448913495,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7414,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.4174112019252735,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.7915,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.3953902231311026,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.7429,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4781709856899902,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7811,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.47252415534413705,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.8806,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.4865219929578287,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.8077,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.4425115033476329,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7107,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.4543841872674269,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7758,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.3867618941791085,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.7309,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.39943496398845935,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.7168,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.3910802366662262,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.7196,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.4535836072431583,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.7941,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4406511069170446,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7642,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.4623494611245526,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.8283,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.4636825420493861,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.6656,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.40609425486236683,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7222,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.4573656546601646,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.8383,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.4222446845886771,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.7961,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.41142821506943217,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.7553,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.41008012075185146,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.7323,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.4370533231761914,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.8049,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.47113539643196534,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.7169,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.39445042339966896,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.693,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.42072701077110986,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.7396,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.47204364252109704,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7521,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.4200290643447494,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.6861,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.4502529045960976,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.7767,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3695645276857633,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.7027,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.4117470459210072,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.7655,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.47394585526275107,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.7737,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.394885661547385,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7386,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.4201613770727662,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.7362,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.3746099633885623,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.7034,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.44593986708903155,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.8034,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.38542476841928264,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.7806,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.461235269742075,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.793,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.3904459463021993,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7182,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.5029031182752419,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.7641,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.4620326356465615,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.8201,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.5001062434922018,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.8608,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.44384500980465535,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.7526,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.4415719249105902,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.7334,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.47188387831377115,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.823,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.3732688102135017,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.6511,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.47634472209126383,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7349,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4364189059028313,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.7164,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.4172250477477957,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 0.793,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.3687764366502754,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.7177,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.40383041751979587,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.6801,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.4355190780186118,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.8137,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.38382928686611334,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.7865,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5227623406808979,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.7953,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.4356224918905176,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.8426,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.40711285939059755,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.7079,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.47183725682436484,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.8434,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.4550066465514676,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.7661,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.3696661486354159,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.7439,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.39398364659212626,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.773,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.4026020601598246,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.7006,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.4362868754323538,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.8482,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.3515804653964102,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.6581,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.4219097245554728,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.73,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.38933215122078296,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.6736,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.44852673008214267,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7977,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.5126751292572221,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.8378,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.3800039625006193,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.7026,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.429692197429523,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.8146,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.3715916865545978,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.697,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.4206690095658035,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.7826,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4215409696210893,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.7789,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.4327914622311286,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.7741,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.41717016511194416,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.7483,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.42355520239782923,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7476,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.3867276064443795,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.7566,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.3877790135354421,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 0.702,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4277111086201661,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7433,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.3837665247016121,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.7263,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.4131361729455296,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.7716,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.39463662549594924,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.704,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.46853489829372913,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.7531,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.4275530686883492,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.7422,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.44886232700057555,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.8124,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.7722961059754266,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.7464,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.4051253424216697,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.7498,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.4070330323475715,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.766,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.39832875640641235,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.7308,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.42205323080764595,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.7341,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.38204097319011965,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.6862,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.3842199481630002,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.7138,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.42338750483674525,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.7055,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.3791350496852627,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.6839,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.41809923777937463,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.7568,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.41242984626947155,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.6956,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4078573922896774,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.7231,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.3967851891990228,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.7102,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.3916542742680514,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.7226,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.39236733974013766,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.762,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.42339914460681044,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.7814,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.42067552663350977,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.7906,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3764735729388975,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7398,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.3834201056794837,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.7034,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.37414714001659194,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.7126,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.4343418279165122,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.7961,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.3624621540174937,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.682,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.45200035616835804,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.7728,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4124796415417627,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7864,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.36882536374265185,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.6878,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.40553927671667717,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.7213,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.3934992140528808,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7619,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.397573986004267,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.7675,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.40835174885439235,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.7699,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.46876368099461874,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.7934,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.49615378169211716,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.7816,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.4053875411080947,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.7561,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.42561219312927,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7132,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.40219316282167905,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.7283,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.42082404021855246,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.762,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4571926838964318,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.7537,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.4206596352880861,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.7656,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.38463464009826004,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.7162,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.44155313503274923,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.8467,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.3611117779925444,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.6999,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.3972195348207249,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.7503,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3535015203682368,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.6968,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.3699748586388991,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.7448,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.4424967904763217,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.791,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.4498128558965614,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8088,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.43667457643021707,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.766,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.44226026815601244,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.7885,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.3916530667440351,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.6686,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.41850974804992097,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.7376,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.41055031607605963,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.769,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.46206511511307036,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.797,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.4728340133986983,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.7736,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.3730680447553586,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.6992,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4074647159116535,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.701,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.4416522556853388,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.767,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.42544120300347543,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.7523,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.5021753292292243,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.813,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.49752288780509973,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.8251,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.43609273060819986,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.7778,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3910594087749003,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7076,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.42399806873192075,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.76,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.4455481847186077,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.7565,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.4159572957710151,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7618,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.3909709559237449,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.7156,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.35451495148952406,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.6832,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3848727942305188,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.7181,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.43545144621771725,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.7785,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.3995989663529192,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.7385,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.495353342458206,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7671,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.47937900411962037,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.8072,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.43175232387396334,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.7469,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4025463144197025,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.7408,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.4013799939839655,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.7749,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.43345023453575765,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.7829,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.4456430739648928,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.7576,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.38188836656420205,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.6475,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.4574232945912238,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.735,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.39777968133725466,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7652,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.45612112090003415,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.7095,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.4075773787749344,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.7045,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.3862909785511598,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.7453,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.399928635845262,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.6982,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.41249320195892486,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.7569,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3911247474399028,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7288,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.4004810545842315,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.726,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.3841028204352315,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.7346,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.41410558984432494,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7583,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.4200283259363451,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.7339,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.4206229025591576,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.7344,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4156803145577367,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7744,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.4012540194977796,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.7509,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.4085776502061583,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.7551,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.39676717102139497,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.7408,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.38378286227930847,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.7295,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.421053547266049,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.8521,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.46088047457782805,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.8125,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.4276792719691738,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.7634,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.3925885280285251,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.6892,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.5430540476336514,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7434,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.3887821052046365,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.7168,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.4079441568956553,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.7344,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4346419986751891,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.7316,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.43325646364544634,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.7776,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.3984917555196218,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.6962,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.4082803946841214,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.7923,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.3641715820819403,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.6643,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.3991432810815113,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.7311,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3863300926249194,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.74,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.4859436336249122,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.7763,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.47830405925011366,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.8074,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.4657261158809216,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.8706,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.36286031031171573,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.673,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.45028701724880094,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.7509,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4201810736786452,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7447,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.4939152221740485,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.7872,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.4066915162517629,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.7218,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.48413692220193694,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.7152,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.42940795553738864,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.7457,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.4828547742134924,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.825,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.40041206947874025,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.6988,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.41485634840986896,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.7078,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.4409892443135497,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.8183,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.39035772942470676,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7061,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.40006171935545326,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.6999,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.446260087804474,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.8193,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3958759602788678,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7817,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.45673771090859966,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.7665,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.3998799624324905,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.7017,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.47534550520921026,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.8133,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.38642056247769024,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.7129,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.40304098042143616,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.7441,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.382119132421216,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.6694,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.3711789028725146,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.67,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.4001171072797071,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.7339,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.4008137127909884,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7103,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.42932147112891744,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.7508,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.443665893254457,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.7586,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.41413894710051435,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.7637,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.4414564870415473,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.8341,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.40053786190114776,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.7158,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.3964999448735445,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7802,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.38901853503797473,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.6997,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.38298133405679025,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.6957,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.39563189030321416,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7326,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.4893833016542019,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.827,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.4701387507607022,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.8191,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.4121261824700458,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.7452,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.4368116591376392,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.7436,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.4600104278464735,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.761,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.41678022759529676,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.8406,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.4335627895419274,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.7462,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.3972841911355577,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.7077,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.40045395994328453,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.6897,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.4061394581635898,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.7358,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.42942956602146515,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.7192,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4075239157016998,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7562,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.3862678290075918,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.7188,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.40313571654281727,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.8012,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4254581098635978,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.6991,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.43773082579986694,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.7523,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.3802979587947457,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.6875,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3922326377194318,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7306,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.4136677784543476,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.7304,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.3705776881560074,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.6981,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4052160744898137,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.7498,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.424671283876574,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.6655,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.3648877686813302,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.6902,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4557973911607531,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.8138,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.38960918349115975,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.6965,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.36731589024834815,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.6657,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.3915171183405222,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7516,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.415460175616078,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.7333,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.3800235477148507,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.7432,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4438067155328549,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7023,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.4208286667125372,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.7524,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.40353530011730415,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.688,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.4199313683077951,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7732,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.3691237934804446,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.7028,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.40556832944039695,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.7003,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5177498432332827,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.8535,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.4793977878312778,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.7669,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.36899268429416426,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.6601,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.3932280743144855,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.7379,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.36856723406712216,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.6514,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.4495881605134991,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.7473,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4710259029838652,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.7786,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.36807343178593516,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.6731,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.34658569975565173,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.6782,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.40579788888054424,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.6872,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.38918742202678164,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.7208,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.3739971026467054,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.6854,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.47901758944800493,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.8333,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.4303849937384296,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.7523,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.4374209556151092,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.7678,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.47198809216605603,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.8356,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.4024862399022665,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.7007,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.3792924443405352,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.7153,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.430053974658381,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7302,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.4012972328411576,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.774,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.39699382652413323,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.7181,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.3719421065089118,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.6906,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.3887516144237888,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.6608,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.3895210635046898,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.7179,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.45450082390451063,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.8367,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.3725659355692262,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.6959,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.35309941086865837,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.6689,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.3963476275709724,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.6858,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.4528365425048492,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.7301,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.3795630731390314,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.6641,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4496395597025512,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.7653,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.4691699332043802,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.7242,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.4069917374756741,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.7501,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.4323151852428113,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7484,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.3829524315730722,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.7703,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.479005062678588,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.8218,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3976082145679388,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.7109,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.376953678622604,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.6589,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.41759204644824766,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.751,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.44479204319411103,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.773,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.40128061601500625,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.7418,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.43936434787238515,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.7049,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.46942071117925305,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.8682,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.3505449819227317,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.6281,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.3886842803955136,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.6781,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.4250225771278342,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7187,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.38015548304077507,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.7301,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.4090302692938638,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7224,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.41420052771101323,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7579,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.3865481278497134,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.7213,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.3711153136441077,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.6416,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.3994148069993138,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.708,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.43846752150037704,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.6679,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.3943227265598233,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.6761,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4690445251675281,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.7539,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.38284744944500254,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.6997,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.39521289749916383,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.6664,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.48568882811090963,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.7892,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.44396111349935563,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.8052,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.3890595348444515,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.6517,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.417056206989733,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7816,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.37572599477212015,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.6736,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.43568748018498377,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.7693,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.4053396712735877,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.6891,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.3575046459966235,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.6732,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.40571273371726574,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.7085,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3756296277014389,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.6825,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.37585736144686316,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.6686,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.47134961235009076,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.8003,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.4229574470837783,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7446,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.41021182123563316,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.7067,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.43462186873872444,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.7266,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.39502082983477904,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7415,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.41845170832603323,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.7058,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.38417517485198227,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.7297,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.46865854857087563,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7911,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.432714007982488,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.7296,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.4130104753805327,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.7109,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3965384097121275,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.712,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.39753345856911626,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.7294,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.37337568391982695,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.6801,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.42495863013532975,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7795,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.3694816298883662,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.7609,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.422197920472493,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.727,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.42133160117778606,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.74,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.40522540034554205,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.7057,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.4257453396923358,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.7536,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.45821836303267105,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.7428,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.40996940512801966,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.7069,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.4136682843634082,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.7174,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4330694095649913,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7558,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.38072139633099955,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.7485,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.44681542398150365,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.752,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.4835967639933307,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.7112,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.4337798575706466,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.7119,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.4394053368246134,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.7606,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4184670488382918,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.7899,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.4486236155519449,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.6806,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.37217166593542145,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.6633,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.4056099913698769,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.6699,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.3748965966249936,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.6827,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.3682053282598978,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.6746,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.4648781783236153,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7523,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.3927181598506785,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.6646,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.4302304548474638,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.7261,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.4037203306983247,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.717,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.45973219805149657,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.7974,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.4156731658562213,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.6816,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.38686160742998493,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.6187,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.38541876275141473,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.731,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.3650985882425949,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.6375,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.3820816035240978,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.6481,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.4098100896890776,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.7088,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.4019992067110189,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.6383,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4477475970231683,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.7161,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.4832723278823965,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.8073,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.39144528129308914,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.7019,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.48014585161105533,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.7228,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.38467131682164596,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.6221,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.4530745317538008,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.718,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4082702248795799,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.6797,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.43342089139276424,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.6645,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.4207451429602256,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.6833,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.38360984855426133,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.6894,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.4240341291920665,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.6722,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.4305886105350733,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.7681,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3823607516679746,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.6938,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.3729241120465026,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.6707,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.4818185548709059,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.7729,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3839065434027884,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6874,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.6072014627863225,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.6463,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.45298407699820475,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.795,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3768745247623755,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6621,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.46702341552804794,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.759,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.37489083732690975,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.6834,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.395287838499955,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.6993,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.4268120690808298,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.7052,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.42529780825975927,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.7153,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4770704866468159,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.8014,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.41073002696832,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.6851,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.42479891566442424,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.7233,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.43012358618229257,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.7238,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.427970768184565,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.7545,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.44220901200305707,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.7454,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3708598571062073,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.6875,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.38154112367893656,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.655,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.4180924914745405,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.7026,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.37706281735537867,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.6642,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.45162402306065524,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.702,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.38926688752106436,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.6694,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4112584316278816,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.7053,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.40729998026649766,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.7366,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.47521057991822163,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7819,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3968728998280063,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.7633,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.39435414186145457,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.6521,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.38015894341698714,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.6816,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.40211203961776676,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.732,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.3687423287361794,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.6605,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.3608434524524818,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.6756,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.3922623609380738,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.6725,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.39597347062270916,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.7483,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.34586685440181364,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.6412,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.44501496790223405,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.673,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.4600204345998243,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.7201,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.4014196509721014,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.699,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.3865882147784291,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6704,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.42434731199748743,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.7314,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.3669948305054552,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.6729,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4080468927162067,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7209,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.40052575061760193,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.7068,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.38516728199904116,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.6365,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4642683269504646,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7884,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.42914379555245596,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.7144,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.39355197006503395,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.6593,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.41403830664421076,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7261,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.39302743911168797,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.7442,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.3799207034605691,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.6613,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4190638514179366,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7603,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.37643098596186125,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.6847,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.39834273988122226,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.7247,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3853714718645344,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6914,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.3853852562389037,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.685,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.40483027574830416,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.6772,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.43773822287020947,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.7151,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.401585884405608,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.713,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.373259268876888,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.6618,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.39481180820237505,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.6679,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.3648008060062637,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.6838,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.35357619534888196,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.6673,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.43376610744122884,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7405,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.34837970865201134,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.6533,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.4208128648663218,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.7388,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.388716968023213,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.709,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.5021201733267082,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.8396,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.36603509229198133,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.6432,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3653170131859906,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.656,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.47370322209378996,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.8309,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.445726497230847,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.7042,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.38058288768635273,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6078,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.4219977727956937,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.7854,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.4003805066609488,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.6933,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.3869224368581654,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.7227,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.3969519981251819,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.6594,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.43243940086344307,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.707,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.40973836972076577,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7206,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.4333513928815028,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.7369,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.3778470478153257,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.6896,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3787175393176003,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.706,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.419055853729895,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.7251,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.3939707033337408,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.7261,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.34817981691445615,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6236,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.39871326971289006,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.7646,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.3994819769347582,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.6639,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.40472772943920193,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.6971,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.4535036946152579,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.7437,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.45382853786824645,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.731,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4099178543711851,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.6624,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.4351417287032228,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.7108,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.3865999238995926,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.686,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.4131282123736442,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7737,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.6147139842631306,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.6286,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.43222057493158733,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.7238,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3656758056743063,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6355,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.3979783969248524,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.7204,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.38982035334648496,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.6974,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.43538029830173397,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.6914,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.4253887801936794,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.7152,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.39675412191254944,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.6935,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.42791248982595853,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.6907,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.4154154123016647,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.7278,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.40598956985927526,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.6992,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3994948473070552,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.7263,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.40504268232009794,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.7408,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.4029635829319738,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.7085,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4091891981199755,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6426,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.364839486764378,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.5997,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.4297159365033185,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.703,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.3637943546123168,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6128,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.3988480910800117,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.6687,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.4161199702634679,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.7151,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.36805818507906174,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.68,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.4009215069937478,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.6919,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.42282417996551025,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.7095,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.43058054375470933,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.7035,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.4217570417848946,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.7146,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.42027817301221454,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.7376,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.33834147707907003,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.6487,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.4134821879558162,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.7886,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.3949731948776204,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.7012,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.34576332442971586,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6382,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.35388171583496386,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.6219,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.38278455453057586,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.7315,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4253906902405411,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.7071,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.39983972561935216,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.6921,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.4908593442118835,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.7144,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.41244314476852084,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7213,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.3768081492396289,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.6784,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.4091084323886437,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.7362,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.39542854824294177,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6316,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.42658853432054694,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.7116,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.4415072977745614,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.7296,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.3940372246149541,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6972,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.40714611050590943,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.7074,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.3947412761805426,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.6611,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.40630643296196406,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.6494,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.37606127365186426,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.6793,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.4225471917863874,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.6946,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.3900813851774753,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.6834,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.3723727654743894,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.6645,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.3761631915368695,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.693,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3586208818733987,
+      "learning_rate": 0.0001,
+      "loss": 0.7007,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.407637951605177,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.702,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.4059428046057281,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.732,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.3850818151480131,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.6823,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.4354398472055768,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.7214,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.395903093538153,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.6995,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.41586008261746277,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.7089,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.385195194061558,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.6803,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.4331270399274859,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.6991,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.42700022596637477,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7252,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.36384908021036255,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.6223,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.39288151581608477,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.6317,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.39847243013391914,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.7035,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.4039212053709652,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.6943,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.42290336828658043,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.7361,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.40170444050876447,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.6502,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.37503291600085475,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.6663,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.4491079520111571,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.7287,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.37615029576785664,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7293,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.4062311879287013,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.7597,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.4106555572244141,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.6898,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.3810138927623543,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.6712,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.43511644718936276,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.6573,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.3596408153487559,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.6547,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4227460775317761,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7407,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.41660743948710655,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.7037,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.42203230232212147,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.7697,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.40976606484533545,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.705,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.35721645630057663,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.6238,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.414618087560364,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.7066,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3824425093661902,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6245,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.3901463888480024,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.7366,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.3858907265428058,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.7096,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.37916375453449047,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6696,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.3903566812994692,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.6568,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.3958293696206784,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.7225,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3932641529729933,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.6274,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.38899566090338883,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.6884,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.4112888261967654,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.7223,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.43383671370061827,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7063,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.40397753060597436,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.7188,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.42631618450905356,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.7059,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.43939400703942777,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.7835,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.4764160775924318,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.7543,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.41294112008589423,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.7348,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.38480351527250445,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6566,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.4208846692026214,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.7269,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.39984952875800545,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.6576,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.450115400163912,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.7546,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.4384828064806114,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.6777,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.3537485819664656,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.6394,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.3925664477015748,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6461,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.4042824405410308,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.7461,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.4150795731005183,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.7075,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4263003552600655,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7252,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.37707965868426524,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.6708,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.3698700436596066,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.6426,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.4129200764803728,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.709,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.4083963175462401,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.6855,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.4663104499736953,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.811,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4297412267341237,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.7313,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.3358616111146125,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.6229,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.3515527834892537,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.6222,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.41901629544799746,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.7907,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.3990309806229665,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.6926,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.40509861786141166,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.6226,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.38529132852394044,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6713,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.41600132868777884,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.7254,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.3787373054795975,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.6199,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.42480829465729075,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6591,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.4017399826137374,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.687,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.3735955162878494,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.6351,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.37236289910720743,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.6406,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.3700742805962829,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.6314,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.38741742446437183,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.6481,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.42061139016224,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6808,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.40189289414047447,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.731,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.43194099617650467,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.7576,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.40434161731585816,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.6961,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.33576949025640107,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.6091,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.4209921060098242,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.686,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.40776023676536,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6471,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.44231962281175785,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.7029,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.4627369363006571,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.7135,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4126007280129486,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.6872,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.40118740732652464,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.7161,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.40074343327936646,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.6029,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.4900151191949164,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7302,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.3978294212795194,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.673,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.42979827939326554,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.674,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.37596990999365637,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.6423,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.330748612387618,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.6101,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.38004071496037267,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.6522,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.3958252004411559,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6519,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.4610886404839161,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.7679,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.43999354538623453,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.7649,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.431954569120567,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.7746,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.41751123735713236,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.6857,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.3639205433366738,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.5999,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.41782076174871,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.7019,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.40137239964322896,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.6499,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.3916657116648281,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.6731,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.43648214272950975,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6936,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.3614702550180529,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.6604,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.39171255560881335,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.6435,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.41475724678973946,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6484,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.37044620468466394,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.6953,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.4104652708106284,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.6964,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.40310359248969063,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.6186,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.4167507617064418,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.6916,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.4036111480437301,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.6987,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.42691669543925936,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7399,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.4896969373431105,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.7086,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.4256350569942081,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.6358,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.44296581475694147,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.7005,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.41529459953318565,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.6933,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.3958102091557811,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6498,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.39296075905514277,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6627,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.36343079853159505,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.634,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.4121630028521987,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.6705,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4707733225954869,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.7134,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.5458950698848051,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.7358,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.4056371907864066,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.6493,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.3391220300166163,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.6239,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.37303526946535215,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.6727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.3975563747867086,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.716,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.40754193597631116,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6645,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.338121630211166,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.6345,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.4025820479162939,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.6989,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.3580610072994493,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6521,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.4119649831750368,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.6839,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.4024807176089574,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.6858,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4029083821244428,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6943,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.37969668653973554,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.6509,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.42182782146596565,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.6985,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.5037731368237888,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.7754,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.37337809091949264,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.6729,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.37837334845546255,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.6453,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.39421939332444644,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6808,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.3784931025230661,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.6368,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.3586567232964291,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.6439,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.4154311155365181,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6845,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.47099031407025466,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.6661,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.374356321105018,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.6592,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.36033775907891485,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.7013,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.4115680582233992,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.7397,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.5948169725433807,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.8126,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.39424201853387325,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6381,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.38596472037517565,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.6297,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.46693697863083733,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.7187,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.38993864802943057,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.6778,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.5095305964683324,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.7064,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.4702317248701071,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.687,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.39116626359279505,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6652,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.41385588550831426,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.7124,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.44730105821365734,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.6595,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4260691895277321,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.6834,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.40502517650835107,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.6344,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.3648254148708204,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.6619,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.33968598750408824,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6153,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.4376866940534957,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.6884,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.386730695532095,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.6565,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.45854972772706987,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7372,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.44580155415378203,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.6983,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.3471943361728626,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.6468,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.371897283706324,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6417,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.37634830715800677,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.6456,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.3643292769279553,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.577,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.37373161239745434,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6718,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.44290810246790635,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.8329,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.3768964282090524,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.6526,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.436605121674996,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.7088,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.4067130810225034,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.6762,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.3883653454852225,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.724,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.46747281420734993,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.6873,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.41205043009038744,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.6204,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.3242277894532913,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.5768,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.47407899824723065,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.7422,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.39820363944011855,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.6312,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.40064410648902177,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.6727,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.456453597260157,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.7063,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.41770655843835774,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.7075,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.3239776965801339,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.591,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.39484658036286896,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.6829,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.41105779736446446,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.6334,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.38249004175965406,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.6742,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3799212702114044,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6081,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.3835122723400976,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.6256,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.4135617148717006,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.6985,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.40148384921597696,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6893,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.39123862561955897,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.6104,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.41887583289624125,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.6715,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.40638348485919545,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7107,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.4083949119435472,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.7447,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.37205781132951976,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.6352,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.38742841472003764,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.674,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.4033401083647566,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.6006,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.37598660488854835,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.6619,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.38121772986234537,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.6168,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.38586387962012575,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.6581,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.38770106756014167,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.652,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.4030686783354466,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.7031,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.39957943934094087,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.6753,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.4026404387336808,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.6309,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.41969507174402737,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.7228,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.39514615047582613,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.6668,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.4140566547772474,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.7516,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.42045406988313533,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.7,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.38178455221971197,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.6577,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.4019039081897145,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.6716,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3588840369768513,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6251,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.4302669315413597,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.6699,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.3810964054868667,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.6837,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.34145638426913544,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6012,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.4009564732786252,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.6984,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.44252840493030354,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.6296,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.48871443302150325,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.7097,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.389317956940689,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.6922,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.3855175133639868,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.6778,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.44817808009371796,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.7177,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.45834516083898486,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.7301,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.41103896454877714,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.6816,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4164157056477641,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.7342,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.38471632755102275,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.6227,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.4266485507533036,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.7651,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.4053459194807253,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6023,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.36870562674552515,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.6791,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.4351844206500832,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.6601,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.40422879255325983,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.6963,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.4106921388255115,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.6799,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.38328431141759234,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.7202,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.3796663367342876,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.648,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.40038403068977274,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6987,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.42132201533530983,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.6697,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4360505688893721,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.5691,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.38331061780435666,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.6395,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.38488621036550796,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.6883,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.39389684709596867,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.6414,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.42512107820513184,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.7008,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.3706379623136382,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.6591,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3570458684110341,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.6095,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.8271381482491155,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.6464,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.43554154795344535,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.6337,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.3926311897135617,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6868,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.38478540025206204,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.6421,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.4253999798599033,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.7009,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4520283482029617,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.6557,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.4481957998629243,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.6792,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.36804510383786965,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.5613,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.43784577616548054,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.7515,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.3537276196215712,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.6131,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.350357203269661,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.6,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.4368660304973895,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.6682,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.4495026254782156,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.7361,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.38677664651139976,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.6633,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.3828365457034956,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7032,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.38427762390030007,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.6446,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.45968469339016016,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.674,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.39687146622532793,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.6928,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.38722431254428086,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.6573,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.3371248323421096,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.5685,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.39236805124952945,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.6556,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.3564760869939289,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.6324,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.4518864695946772,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.7223,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.40173947579151,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.6502,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.41437685149036163,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.7034,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.3742027548508857,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.6564,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.4294903032274521,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6838,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.3575554784385886,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.6564,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.4113853938730298,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.6645,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4212484075042805,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7032,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.3604563221522146,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.6126,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.3911411320130094,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.6888,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.46634873634767904,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.7848,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.3746195868623726,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.6348,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.45268404733657536,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.6367,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.40228550414974823,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6452,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.38298979887710355,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.654,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.40411342029411973,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.6474,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.33999023185508753,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6017,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.4456025315628354,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.7676,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.3865625631324515,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.6322,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.36785554514355084,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.6363,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.5623616443529484,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.831,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.36364798127137493,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.6062,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.3644481549404707,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.5689,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.42209112129325826,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.6826,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.3934119923502396,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6653,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.39861550420020664,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.6519,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.5533346648181713,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.7071,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.72441566042067,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.6945,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4496207841402144,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.6927,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.35741518746187967,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.6388,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.37513644438477145,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.6884,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.6741583897905343,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.707,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.39833802909880506,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.6929,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.35463548537829404,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.5978,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.5320189322195104,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.8426,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.46625066246950425,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.7109,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.4070584110611591,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.6713,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.42093544577227315,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6742,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.4103878154272117,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.7077,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.3828060499876506,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.6372,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.38990894323417574,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6586,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.37954425450696466,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.6697,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.3532334908994542,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.592,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.4195420438639813,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6759,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.37941635847563,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.5741,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.4648313404904717,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.6948,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3711456063807938,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.5983,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.3654848182579989,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.6036,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.37029369470976586,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.5892,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3922062442904201,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.692,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.4967936001419372,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.744,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.3931460680340042,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.6686,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.4280181545183494,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.6806,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.42223312575635014,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.6923,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.3809484000051097,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.6413,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4210284404515193,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6599,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.3607822198484989,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.627,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.366669715798918,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.6031,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.40441128854705194,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.63,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.3612029088391989,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.6508,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.3901357281304347,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.6449,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.36044613359339933,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6401,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.36899814505997225,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.6334,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.391995130463381,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.6472,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.4340878228899344,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.7035,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.4423501611222201,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.6572,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.38532884886931085,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.6644,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.44444047693909494,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.7538,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.4262671758385985,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.7397,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.4626250448381168,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.6834,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.375971749447931,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6049,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.38727589689119296,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.6471,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.4009341292741956,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.7063,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3975197832083184,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.6391,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.37280124273004256,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.6313,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.4373590508588539,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.7093,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.38578450678888154,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.6489,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.4853093037820455,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.7684,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.4251998933688053,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.6282,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.42300865697478757,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.663,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.43778930703677116,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.701,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.39122495916107725,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.6524,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.45307591571359396,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.6816,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.38140562797791966,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.6353,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.4068793642660578,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.6598,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.39167806307515934,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.668,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.43109647072709595,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.7076,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.36208296146563684,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.6429,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.47041028455487194,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6851,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.3664645579737971,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.5857,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.4384290774847837,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.7168,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4242032275841449,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.673,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.3634144295847843,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.6246,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.40844376488531203,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.6668,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.5714199677512499,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6787,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.39084600958483035,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.662,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.404196719321763,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.6633,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.4421065143917544,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6901,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.35196517976013403,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.6108,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.37411979187532646,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.6329,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.42420550637332644,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.6884,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.42182543477882817,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.6573,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.39889234567331394,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.6751,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.39676110200169845,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.6921,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.3938882372029621,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.6185,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.3869942510770041,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.5917,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.3922253159288967,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6634,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.41825733938280657,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.7077,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.43317183056056374,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.7164,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.36150784828671517,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6391,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.36723145086122616,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.6315,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.4103678298964808,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.6759,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.4196417562429959,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6958,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.39654831771035837,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.6697,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.3943983098689371,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.6214,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.39397632964641327,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6901,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.4496753311469288,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.7014,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.37580320728220684,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.6636,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.42177084717029684,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.6555,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.40646810663190575,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.6469,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.41365508067329304,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.6685,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.34714694057200485,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.6352,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.4151687622915957,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.6491,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.3632469527368112,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.6121,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3920323388470978,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6097,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.39744170045940946,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.6475,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.446471427295853,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.6765,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4063003989199862,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6193,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.37094394307109213,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.6314,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.3863751730451606,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.6754,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.3946416135622672,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6659,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.3668017613796513,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.6359,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.3532747947569028,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.6381,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.49264762378255045,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.7229,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.3979815736399481,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.6503,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.35860055835757615,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.5768,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.42806578552403296,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.7102,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.3711673262227997,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.6037,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.4260368302830914,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.6336,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.41828555619842467,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7052,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.3678885155752684,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.6019,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.40274233978981605,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.7069,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.4600956157514961,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.7972,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.40978340103739513,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.6009,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.4443155799954472,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.7042,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.3615102462432067,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.5927,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.7930685099141982,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.681,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.4077551476687731,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6321,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.41118679347968756,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6219,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.39624353753623753,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.6079,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.4068515623858212,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.6467,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.36293900621263614,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6336,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.4069638502104282,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.6369,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.3858394209281249,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.647,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.3643715909820167,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6538,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.3771449057141627,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.6428,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.42924005537930676,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.6639,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.38883926038556477,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6504,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.4096661812753556,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.6578,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.36693699193296664,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.6088,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.3796701162868268,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6002,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.4828987134519485,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.6435,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.37167365428892346,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.6004,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.42296109076969385,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.728,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.41048258922277014,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.6761,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.42100029204947237,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.677,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.4329414151995111,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.6369,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.3863961244631767,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.659,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.38246873643359003,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.6679,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.40424047037882044,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6732,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.38861598750942106,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.6913,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.4260659280740861,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.6512,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.39337819894073167,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.6274,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.45108459949637103,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.6799,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.427549109761383,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.6847,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.40913046484681276,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6988,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.3766080702153803,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.6039,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.40369322638801514,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.6407,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.38091158526163255,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6456,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.3883533040585799,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.6068,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.36799108174224,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.6364,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.42384231178399634,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.6556,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.50356351760391,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.6359,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.5604868933950006,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6685,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.4299467016549516,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6582,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.5161869734884429,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.737,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.4559428429138101,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.7054,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.4092761318759988,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6518,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.39530471957129254,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.6611,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.42085708140120626,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.6925,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.44992897855631964,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6549,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.4092292675638527,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.6093,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.39009926160521424,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.6005,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.36085329648766035,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6585,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.4369052587142333,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.6494,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.3744016775256687,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.5725,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.4121362965207311,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.6017,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.37726568920550346,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.6836,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.4418938398867874,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.703,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.47506338936522274,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.6722,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.44442248833841674,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.6797,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.4621149821698225,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.7252,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.3907540229713785,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6067,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.38864671047408117,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.6273,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.4237723464624118,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.6868,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.4190239875572504,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.657,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.37781107305407086,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.5972,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.38874215364153275,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.6169,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.36940896101229764,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.6558,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.4291282188176774,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.6454,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.3839284709830824,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.6598,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4108258871751767,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6245,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.41932992628992855,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.7086,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.3731122474497691,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.5769,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.40271118418721064,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.6305,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.3759988991491332,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.6101,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.357343527288196,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.5465,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.38925040840668906,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.5655,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.48160768866848636,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.641,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.4244775309893954,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.6801,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.4052513125568629,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6314,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.4530633727705138,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6483,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.37044257317667545,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.5936,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.41393619426439865,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6154,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.4321874501000778,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.6521,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.4096361984440969,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.6606,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.41634198041514003,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.7042,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.3915463794585155,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.6521,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.4071448592159562,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.6251,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.42708533840760865,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6636,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.41917061115204585,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.6674,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.4201973493458763,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.6569,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.43340039241864153,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.7164,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.3991819504268546,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.6478,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.3983915458949484,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.678,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.42485343295316874,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.6346,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.3772056934359643,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.6295,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.45201756095626,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.6335,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.37976490138332747,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6227,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.3902287384849779,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.6666,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.3442031570950758,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.5574,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4228465689249216,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6154,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.44808713358890007,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.6696,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.4116323837110524,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.6117,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.4118234638399567,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6487,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.3996291046457449,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6403,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.4054129911232635,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.6725,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.37006654804032474,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.5553,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.4637948919550711,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.7438,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.40112128444826517,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.6543,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.41915403147154934,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6369,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.3920247905350215,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.6751,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.3726364042887898,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.5884,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.44360332125933893,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6872,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.44282322726263496,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6916,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.3663176808737401,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.6582,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4173314589810813,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.7487,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.4177413822287383,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.6562,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.4018296608992928,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.6825,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.577563320408092,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6675,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.4187052591946463,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.6964,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.45419004967260235,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.7142,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.4749285727661252,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.7697,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.403606872806351,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.599,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.3800456307931478,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.6197,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.39692274903271046,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6435,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.3500391944449609,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.5917,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.370806006504793,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.6059,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.4532424951174448,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6522,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.3812118982948848,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.6228,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.4042590705033425,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.5912,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.37605121562082366,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.5777,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.41893809793390974,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.6606,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.4308812653975276,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.6721,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.4086330696324356,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.7012,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.4317896937328017,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.7462,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.38177599549512226,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.6161,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.4087168870395883,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6338,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.4577575610849982,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.6694,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.5402252955049723,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.7686,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.4564488714688111,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.7128,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.4256599732779054,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.6829,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.43274994041108594,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.6206,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.36086439574030127,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.6216,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.42244229931003574,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.6722,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.42155871559748825,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.682,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.371315898744865,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.6007,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.43003284582729545,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.6504,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.36370255523634354,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.639,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4704806704812013,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.6655,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.3815879147078225,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.6149,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.4527025181288094,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.748,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.4897101550964116,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.7656,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.37319089791901433,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.5999,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.3758775079088501,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.6063,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.40838567930237774,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.7114,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.3973963251318473,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.6397,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.3579607168349245,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.6355,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3824067101446628,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.5695,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.4925954361146254,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.6677,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.4276981555335783,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.6416,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.43558168937581576,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6592,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.4238141183231073,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.6972,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.4244831238104071,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.6191,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.6300763843676416,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.625,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.3967432502181859,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6343,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.3996275527322477,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.6115,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.43478016649919016,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.683,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.3935437242031588,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.6435,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.3912390364000557,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.6355,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.4210375910070289,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6788,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.38713562077626823,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.6462,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.37313864212152587,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.6321,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4133781840555599,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.7128,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.41490502511206495,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.5811,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.4230171985342458,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.701,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3758267013887235,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.5755,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.4194968203142697,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.6747,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.3682826299485392,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.5932,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.38930004217634334,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.6158,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.4158573333782977,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.692,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.3742543536831887,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.6646,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.4565482831683287,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6546,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.4648477002260928,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.6678,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.39928863792951164,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.6211,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4462595214667838,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.6837,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.3651232552582547,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.5964,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.39227730756186174,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.6489,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.3870925483846158,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6095,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.42167014093711325,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.6827,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.40072319805451717,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.657,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.39007928494064764,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.5955,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.37856632658691275,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.6409,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.4041436839925096,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.6301,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3959519258693735,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.664,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.4337337537945873,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.6291,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.3433934396705534,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.5915,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4323739867974528,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6997,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.36787797117790366,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.5925,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.3485124873011298,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.5565,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.424398798101433,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.632,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.3828073131985413,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.6554,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.33559104771661646,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.5688,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.4400788944876675,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.6421,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.4182109230731653,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.6735,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.39126204447831453,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.6732,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.35938102444571973,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.6573,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.4034326249578852,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.602,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.3653789457849746,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.5865,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.45162387923875585,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6558,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.42801086323324233,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.6498,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.42856373941025483,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.6719,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.4210817050468532,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6711,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.36419654040815,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.5764,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.38983772601087396,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.6686,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.40031493797764406,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6085,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.42784491520692725,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.5968,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.39096323767038194,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.6616,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.4565965115949712,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6841,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.4562508675055286,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.7019,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.4109227775066737,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.7138,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3804421012815039,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6869,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.37864548372426227,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.6293,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.3899734187282974,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.6219,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.46146026784741373,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.7349,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.39563096569301015,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.6012,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.36349028623938273,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.6323,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.5530220253152613,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7134,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.3633381559014149,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.5883,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.524058103797069,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.6618,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.409430667623594,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6635,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.3302131801718762,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.5556,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.39402409614544565,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.6399,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3859745387849683,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6593,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.318991288535117,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.5662,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.39710258533822207,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.6379,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.37860102198584894,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.5985,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.36232533598447775,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.5908,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.43460142571501387,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.6522,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3876925245069668,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.6188,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.41730345165972554,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.6449,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.3698623883860055,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.6304,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.3696486364986156,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.6236,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.37919659148999296,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.6652,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.3969738606412844,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.63,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3890817919498963,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6088,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.375946683342908,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.5831,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.42534835273590144,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.6552,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.3813542531242084,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.5825,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.48512387811630203,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.7155,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.44487062533728505,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.6486,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.4144096224359702,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.6182,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.4275581658139899,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.6368,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.41217208846370146,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.5757,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.4311514560125804,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.7516,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.4441582542928189,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.6473,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.42763724124452335,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.6022,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.4370781023309805,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6075,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.42278656845369844,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6556,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.41799589574914214,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.6304,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.4095696789811881,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6205,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.3842838477008584,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.6719,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.40098624249490195,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.6649,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.40500561553935305,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6583,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.36756166218603953,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.6197,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.5799918647625846,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.6944,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.39258357313133585,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6348,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.42613114420474996,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.6787,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.36014420147296194,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.6289,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.38731797911735133,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6178,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.4359243445420513,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.6695,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.36746766889294014,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.62,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.4188819623793009,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6775,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.3837328531647139,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.5817,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.4037215580067267,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.6739,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3890101154213983,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6209,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.3818105226730483,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.665,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.4446305625611964,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.6634,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.45066700064603565,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.732,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.5506659865294842,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.8025,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.3738317938225511,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.6668,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.4733935783945023,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.7105,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.44189523647167056,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.648,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.4445711433398203,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.7302,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.3728209155473441,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.605,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.4135229458348511,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.6648,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.534394080276321,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.7504,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.44158154206796363,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6495,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.34078346470207904,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.5454,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.3555839691863366,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.5889,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.371060102479731,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.631,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.39526111909680517,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.6089,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.4283808255240205,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.6206,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3779760877382535,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.5989,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.3440769727719932,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.5457,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.44655105361023856,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.6729,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.36877895299568914,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.5614,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.37325699927081074,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.5595,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.3786447935064768,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.6037,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.38025555154341495,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.5795,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.4152252835746442,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.6059,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.414720058216093,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.6171,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3805423945862187,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6218,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.4532193207806232,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.7583,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.41179295805269606,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.6657,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4280068243529339,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6707,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.3545547059777766,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.5767,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.395744975669382,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.6055,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.40435885790982506,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6379,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.3814437771569836,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.618,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.811008855038527,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.6972,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.42911928712400943,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.6791,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.5597486842534753,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.6987,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.41208830819453646,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.5736,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.4034401621145211,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6322,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.4093471387081946,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.6105,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.4238200528582358,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.6439,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.42386097679209056,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6438,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.4599432589024242,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.6614,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.4833338053488303,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.7221,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4240188291338941,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.6379,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.41703674870985313,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.6714,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.3967394191445386,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.6442,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4089089452183383,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.5991,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.3767937158524674,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.6341,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.38585512834586744,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.6273,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.4586049348519703,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.6217,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.40138686888256814,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.6547,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.43601738666780193,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.6823,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.420956972882786,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6285,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.38614646380632833,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.6359,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.4449228643853392,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.7102,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3816940892253165,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.6253,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.4106320807724667,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.5993,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.40646652543853595,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.6506,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3710293297476454,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6178,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.44519568420444267,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.6283,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.37961540701591967,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.636,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.43152261029008687,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6442,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.3839027909818481,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.6797,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.43003305226010224,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.5905,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.40204696343519214,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6574,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.3901838374632061,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.634,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.441275045415316,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.7004,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.374899335238422,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.5719,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.41052867678399346,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.6396,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.39889174261094934,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.6237,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4486853289180069,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.6717,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.41471453140962095,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.5692,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.393165175176718,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.6424,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.40706528759649374,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.6497,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.4979494630382395,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.7129,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.3753930789086421,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.6348,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3559295297272381,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6077,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.46054781187370936,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6117,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.38305639769805905,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.6267,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.39004070336122815,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.6533,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.41520161480351203,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.5985,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.41006185649112353,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.5896,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4624709342649789,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.696,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.4777229066513596,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.704,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.409799860675126,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.6518,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.378772725904361,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6191,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.384915561919326,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.6043,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.37544872803810414,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.5987,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.38668980561915756,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.5679,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.3710676543224045,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.5819,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.42198902606213534,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.6122,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.4007596676485821,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.6589,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.37845868804855115,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.622,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.4458854404524312,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.6845,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.49399594676955133,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.7664,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.41750762623253024,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.6847,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.38704341257194147,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.6563,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.4564098292116241,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6974,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.3884655746811534,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.688,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.43801374114327774,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.6638,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.4109793755063495,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6284,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.3840239022455946,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.6098,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.47318182719166385,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.67,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.4155046383093813,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.5981,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.3750203784525229,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.6395,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.36006756331906975,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.5712,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4320273428647143,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6927,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.4210155540691878,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.5523,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.4310062764474239,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.6648,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.37958233122421753,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.6632,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.45581052332137084,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.6538,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.41234748540786764,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.6398,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.36560884716950104,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6237,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.4116871346092955,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.6125,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.9304496331226776,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.7179,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.3864823364743263,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6261,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.43235009161967447,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.6229,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.3849197493302353,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.5848,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.40134861331251387,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.6163,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.35905598866420474,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.5709,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.43085998890856214,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.6428,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.3598609268107039,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6099,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.44991690422363384,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.6848,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.39099610363033765,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.6379,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.40579872459583405,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.645,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.39868018210085365,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.5881,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.44413940180092004,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.7001,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.41125705692688785,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6065,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.37943719234746764,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.5538,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.37926866834963974,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.6136,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.3805039314657714,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.5837,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.42719801385702594,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6021,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.40858697057497695,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.6243,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.4487670605981078,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7142,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.4224541816095461,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.6509,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.42497622693465,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.6408,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.4364536219536688,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.6738,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.3725518080494089,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.6298,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.38823486298009524,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.6046,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.43667157632158776,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.645,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.40600697636904476,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.5678,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.4336830850484576,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6722,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.38487069921133976,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.6327,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.36479045347086214,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.591,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.37428770582908677,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.5923,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.4314768621928627,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6457,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.4147033583145242,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6127,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.42763395930014286,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.6814,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3870328118798832,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.5969,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.34963742138630666,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.5615,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.3763924966431932,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.6232,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.4838892027395187,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.73,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.40597714008074315,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.6208,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.3817389249841561,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.6221,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4032564775019703,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6758,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.368876604368028,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.603,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.360160879563739,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.5842,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.4175421047933967,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.5573,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.46124172093356386,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.7338,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.41660671080560063,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.599,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.39971810732775126,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.6217,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.4240233802100902,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.6787,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.4106099086169289,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.6475,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.4484919955657655,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7082,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.5168506298073032,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.7011,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.37421652043750936,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.6033,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.3825406307257205,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6274,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.36370105383226464,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.6131,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.4362826352937529,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.6963,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.38562300339047656,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6073,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.4101889385189077,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.6342,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.4084494005339445,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.6384,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.393190863886088,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.7246,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.4130874391359839,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.6573,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.43866952692769,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.6903,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.4756296648677467,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6519,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.3920615394297473,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.5875,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.3901570236370613,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.613,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.39027291245367424,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.618,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.4044380141653884,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.6489,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.5909282315563071,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.6663,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.4864962177965382,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.7379,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.37261687755147305,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.614,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.5168912163553879,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.6986,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4050537228349556,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6398,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.46744115057105257,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.6044,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.4117992538107304,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.6387,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.37111934186497864,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6629,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.4541830541597025,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.6629,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.43280828716229025,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.6523,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.6536403315965769,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.657,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.4258932375978754,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.6978,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.39518358503813816,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.6285,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.3928729122003721,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.6128,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.38598983650286456,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.5799,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.3948251574075436,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.5937,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.426109880344021,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6211,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.4236366004620384,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.6471,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.3762315325341303,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.5637,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.5431328948750804,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.699,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.4327306904052105,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.6687,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.3750053372630263,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.6314,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.4392820289743807,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6429,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.38998112576797356,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.5897,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.40427089101594027,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.6488,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.4000013580423989,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.625,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.4229711291114719,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.6235,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.39532708500394637,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.6519,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.430113144669036,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6273,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.4070632065693182,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.5986,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.4303476610897834,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.6181,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.49460620856969917,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6842,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.3731127474613181,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.5763,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.3603271539682308,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.5601,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4061619820938722,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.5951,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.3852902775921212,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.5901,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.44172896935187084,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.6538,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4048509328319604,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6207,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.40031133738130914,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.6316,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.35283342528313516,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.5664,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4048802276679688,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6218,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.4112765662482519,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.6512,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.40104607022600364,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.6564,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.41389154069855416,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.5588,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.36243414198821367,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.5944,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.4525820312381907,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.6749,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.45642704463778055,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6813,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.39115268818088905,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.6345,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.388520173659741,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.6512,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.40510235035171605,
+      "learning_rate": 0.0,
+      "loss": 0.6228,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1687683967877120.0,
+      "train_loss": 0.7127048384984335,
+      "train_runtime": 29360.6535,
+      "train_samples_per_second": 1.022,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1687683967877120.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a02d8b3c14d5889ddb631db01ebdbac41ec62e3
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "down_proj",
+    "k_proj",
+    "v_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e141a974a7f8557f54a8b6076af31572577416d3
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4292f08b8ec20f19c73067870e2f74efe60f1dd89419bb8d5d405c2c00557294
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0feecc49996d6d6865779480a5b949e8948bd878
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b8462bc1aee5e57a6201e3313a32350c879a0c4a5883072868e11f4a84398e7
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..65639ca9d4c0ee3cce8cdf60c9525248b28c2fe4
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,13167 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 0.6944415840873915,
+      "learning_rate": 3.5087719298245615e-06,
+      "loss": 1.2371,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 0.9237516073149323,
+      "learning_rate": 7.017543859649123e-06,
+      "loss": 1.346,
+      "step": 2
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.8898403652257272,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 1.4487,
+      "step": 3
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 0.8272542005004054,
+      "learning_rate": 1.4035087719298246e-05,
+      "loss": 1.3012,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 0.672855335636144,
+      "learning_rate": 1.7543859649122806e-05,
+      "loss": 1.2082,
+      "step": 5
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7541778585932635,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 1.3162,
+      "step": 6
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 0.7218365791439992,
+      "learning_rate": 2.456140350877193e-05,
+      "loss": 1.2852,
+      "step": 7
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 0.6028645378944271,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 1.1151,
+      "step": 8
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.6733007950798171,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 1.1556,
+      "step": 9
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 0.5602874051811487,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 0.9803,
+      "step": 10
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 0.9179182307731018,
+      "learning_rate": 3.859649122807018e-05,
+      "loss": 1.1182,
+      "step": 11
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.751234536943168,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 0.9436,
+      "step": 12
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 0.6826276028025567,
+      "learning_rate": 4.56140350877193e-05,
+      "loss": 0.9327,
+      "step": 13
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 0.7852409823004107,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 1.0438,
+      "step": 14
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.5992997187296086,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 0.8858,
+      "step": 15
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 0.5874456800487622,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 0.9388,
+      "step": 16
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 0.5822876374605616,
+      "learning_rate": 5.9649122807017544e-05,
+      "loss": 0.974,
+      "step": 17
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.5604867750525585,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.9109,
+      "step": 18
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 0.5783177884247204,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 1.0157,
+      "step": 19
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 0.5432343133587975,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.9145,
+      "step": 20
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.4983104199001195,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 0.9487,
+      "step": 21
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 0.5526259301736156,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.9286,
+      "step": 22
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 0.4465931752876837,
+      "learning_rate": 8.070175438596491e-05,
+      "loss": 0.9112,
+      "step": 23
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.41106759052665426,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.8381,
+      "step": 24
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 0.43819578112888324,
+      "learning_rate": 8.771929824561403e-05,
+      "loss": 0.8083,
+      "step": 25
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 0.5415670158779432,
+      "learning_rate": 9.12280701754386e-05,
+      "loss": 0.9781,
+      "step": 26
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.4239940951437761,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 0.8188,
+      "step": 27
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 0.5164350404936713,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.9685,
+      "step": 28
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 0.5372237610117722,
+      "learning_rate": 0.0001017543859649123,
+      "loss": 0.976,
+      "step": 29
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.5209825847266434,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 0.8238,
+      "step": 30
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 0.5055305931315417,
+      "learning_rate": 0.00010877192982456141,
+      "loss": 0.8523,
+      "step": 31
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 0.5877233369375162,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 1.002,
+      "step": 32
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.5175698032973742,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 0.8617,
+      "step": 33
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 0.43747111829307783,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 0.8128,
+      "step": 34
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 0.49290333964536837,
+      "learning_rate": 0.00012280701754385965,
+      "loss": 0.9131,
+      "step": 35
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.46999372987087223,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.8599,
+      "step": 36
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 0.43591327572579525,
+      "learning_rate": 0.0001298245614035088,
+      "loss": 0.7806,
+      "step": 37
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 0.4548235931595958,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.8169,
+      "step": 38
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.4883958779391726,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.887,
+      "step": 39
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.5010048183676208,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.9388,
+      "step": 40
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 0.42906779053114247,
+      "learning_rate": 0.00014385964912280703,
+      "loss": 0.8438,
+      "step": 41
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.4664966442277581,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8671,
+      "step": 42
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 0.4853123832044512,
+      "learning_rate": 0.00015087719298245616,
+      "loss": 0.8684,
+      "step": 43
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 0.4303822583017822,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.7864,
+      "step": 44
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.46357800086937717,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.8621,
+      "step": 45
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 0.4546670490950593,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.8015,
+      "step": 46
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 0.44919262163237095,
+      "learning_rate": 0.0001649122807017544,
+      "loss": 0.8074,
+      "step": 47
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.4951024350936205,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.8764,
+      "step": 48
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 0.4585135436466692,
+      "learning_rate": 0.00017192982456140353,
+      "loss": 0.8491,
+      "step": 49
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.5448653801108341,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.9423,
+      "step": 50
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.46467634169454985,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.8535,
+      "step": 51
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 0.4601959184314359,
+      "learning_rate": 0.0001824561403508772,
+      "loss": 0.8206,
+      "step": 52
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 0.4116027116956907,
+      "learning_rate": 0.00018596491228070177,
+      "loss": 0.7936,
+      "step": 53
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.45507183958473724,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.7862,
+      "step": 54
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 0.450173247372,
+      "learning_rate": 0.00019298245614035088,
+      "loss": 0.8451,
+      "step": 55
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 0.4762619236369063,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.8945,
+      "step": 56
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.5336252193775527,
+      "learning_rate": 0.0002,
+      "loss": 0.9459,
+      "step": 57
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 0.4798122373225587,
+      "learning_rate": 0.00019999985069241055,
+      "loss": 0.8528,
+      "step": 58
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 0.4432560945756295,
+      "learning_rate": 0.00019999940277008808,
+      "loss": 0.8344,
+      "step": 59
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.447266664393156,
+      "learning_rate": 0.00019999865623437013,
+      "loss": 0.8578,
+      "step": 60
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 0.429591571022727,
+      "learning_rate": 0.00019999761108748597,
+      "loss": 0.7733,
+      "step": 61
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 0.45565926368809806,
+      "learning_rate": 0.00019999626733255662,
+      "loss": 0.8608,
+      "step": 62
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.4403742976437441,
+      "learning_rate": 0.00019999462497359466,
+      "loss": 0.8869,
+      "step": 63
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 0.4886789451410554,
+      "learning_rate": 0.00019999268401550447,
+      "loss": 0.8765,
+      "step": 64
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 0.49902160576321314,
+      "learning_rate": 0.000199990444464082,
+      "loss": 0.9175,
+      "step": 65
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.42451260180558853,
+      "learning_rate": 0.00019998790632601496,
+      "loss": 0.8445,
+      "step": 66
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 0.5036458418688877,
+      "learning_rate": 0.00019998506960888256,
+      "loss": 0.8685,
+      "step": 67
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 0.5215355346442807,
+      "learning_rate": 0.00019998193432115572,
+      "loss": 0.886,
+      "step": 68
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.4810706495362761,
+      "learning_rate": 0.0001999785004721968,
+      "loss": 0.83,
+      "step": 69
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 0.4752802062241432,
+      "learning_rate": 0.00019997476807225985,
+      "loss": 0.8572,
+      "step": 70
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 0.48887680784833526,
+      "learning_rate": 0.0001999707371324904,
+      "loss": 0.8122,
+      "step": 71
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4464455656623525,
+      "learning_rate": 0.00019996640766492543,
+      "loss": 0.7583,
+      "step": 72
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 0.4502376355976084,
+      "learning_rate": 0.00019996177968249334,
+      "loss": 0.8145,
+      "step": 73
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 0.47371965043639064,
+      "learning_rate": 0.0001999568531990141,
+      "loss": 0.8902,
+      "step": 74
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.43692843711423124,
+      "learning_rate": 0.00019995162822919883,
+      "loss": 0.8395,
+      "step": 75
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 0.5391365712735926,
+      "learning_rate": 0.00019994610478865011,
+      "loss": 0.9224,
+      "step": 76
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 0.4679510818463831,
+      "learning_rate": 0.0001999402828938618,
+      "loss": 0.8445,
+      "step": 77
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4410149228044899,
+      "learning_rate": 0.00019993416256221895,
+      "loss": 0.7968,
+      "step": 78
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 0.41427675369684946,
+      "learning_rate": 0.00019992774381199778,
+      "loss": 0.735,
+      "step": 79
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.4984554811446533,
+      "learning_rate": 0.00019992102666236566,
+      "loss": 0.8434,
+      "step": 80
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.43592960041981665,
+      "learning_rate": 0.00019991401113338104,
+      "loss": 0.7589,
+      "step": 81
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 0.45766415866525306,
+      "learning_rate": 0.00019990669724599336,
+      "loss": 0.827,
+      "step": 82
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 0.5162995961208674,
+      "learning_rate": 0.00019989908502204292,
+      "loss": 0.8072,
+      "step": 83
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.4737438201236418,
+      "learning_rate": 0.00019989117448426108,
+      "loss": 0.8065,
+      "step": 84
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 0.5186994360802084,
+      "learning_rate": 0.00019988296565626987,
+      "loss": 0.8736,
+      "step": 85
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 0.4618359832966522,
+      "learning_rate": 0.00019987445856258206,
+      "loss": 0.8005,
+      "step": 86
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.4574079671827261,
+      "learning_rate": 0.00019986565322860115,
+      "loss": 0.7259,
+      "step": 87
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 0.46175694382654586,
+      "learning_rate": 0.00019985654968062122,
+      "loss": 0.85,
+      "step": 88
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 0.4291930547499251,
+      "learning_rate": 0.00019984714794582683,
+      "loss": 0.7666,
+      "step": 89
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.41983085824612487,
+      "learning_rate": 0.00019983744805229296,
+      "loss": 0.8257,
+      "step": 90
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 0.44957430818236727,
+      "learning_rate": 0.000199827450028985,
+      "loss": 0.853,
+      "step": 91
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 0.4600564721570993,
+      "learning_rate": 0.00019981715390575858,
+      "loss": 0.8087,
+      "step": 92
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.6253489587923583,
+      "learning_rate": 0.00019980655971335945,
+      "loss": 0.7901,
+      "step": 93
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 0.5281010561079615,
+      "learning_rate": 0.00019979566748342347,
+      "loss": 0.7997,
+      "step": 94
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 0.45303075638233853,
+      "learning_rate": 0.00019978447724847652,
+      "loss": 0.7914,
+      "step": 95
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.41303206939142767,
+      "learning_rate": 0.00019977298904193437,
+      "loss": 0.7501,
+      "step": 96
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 0.40984017273528417,
+      "learning_rate": 0.00019976120289810247,
+      "loss": 0.7245,
+      "step": 97
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 0.4650303499074836,
+      "learning_rate": 0.00019974911885217608,
+      "loss": 0.858,
+      "step": 98
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.5035130651520404,
+      "learning_rate": 0.00019973673694024,
+      "loss": 0.8126,
+      "step": 99
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.4054722904374148,
+      "learning_rate": 0.0001997240571992685,
+      "loss": 0.7733,
+      "step": 100
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 0.46034678879795327,
+      "learning_rate": 0.00019971107966712518,
+      "loss": 0.7837,
+      "step": 101
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.46416172423665386,
+      "learning_rate": 0.00019969780438256293,
+      "loss": 0.8471,
+      "step": 102
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 0.39898803977700154,
+      "learning_rate": 0.0001996842313852238,
+      "loss": 0.7586,
+      "step": 103
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 0.41641998385733875,
+      "learning_rate": 0.00019967036071563877,
+      "loss": 0.7471,
+      "step": 104
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.4794689638062486,
+      "learning_rate": 0.0001996561924152278,
+      "loss": 0.8725,
+      "step": 105
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 0.39489918088446685,
+      "learning_rate": 0.0001996417265262996,
+      "loss": 0.7614,
+      "step": 106
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 0.44357484154729226,
+      "learning_rate": 0.00019962696309205148,
+      "loss": 0.8444,
+      "step": 107
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.4255774958424041,
+      "learning_rate": 0.0001996119021565693,
+      "loss": 0.7657,
+      "step": 108
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 0.4262064191118899,
+      "learning_rate": 0.0001995965437648273,
+      "loss": 0.8007,
+      "step": 109
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 0.3821179705066038,
+      "learning_rate": 0.00019958088796268793,
+      "loss": 0.7376,
+      "step": 110
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.42732811385386976,
+      "learning_rate": 0.0001995649347969019,
+      "loss": 0.717,
+      "step": 111
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 0.5120400356904402,
+      "learning_rate": 0.00019954868431510764,
+      "loss": 0.7882,
+      "step": 112
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 0.4868550700656959,
+      "learning_rate": 0.00019953213656583168,
+      "loss": 0.8392,
+      "step": 113
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4184263053969323,
+      "learning_rate": 0.00019951529159848805,
+      "loss": 0.7542,
+      "step": 114
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 0.6929701997915731,
+      "learning_rate": 0.00019949814946337838,
+      "loss": 0.8437,
+      "step": 115
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 0.4566990560424279,
+      "learning_rate": 0.00019948071021169174,
+      "loss": 0.7528,
+      "step": 116
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.4121649246666184,
+      "learning_rate": 0.00019946297389550433,
+      "loss": 0.7822,
+      "step": 117
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 0.4253471446520878,
+      "learning_rate": 0.00019944494056777946,
+      "loss": 0.8152,
+      "step": 118
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 0.45888022790076605,
+      "learning_rate": 0.00019942661028236745,
+      "loss": 0.8862,
+      "step": 119
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4636889247958515,
+      "learning_rate": 0.00019940798309400526,
+      "loss": 0.802,
+      "step": 120
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 0.45048156433075803,
+      "learning_rate": 0.00019938905905831654,
+      "loss": 0.8117,
+      "step": 121
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 0.4406797785869058,
+      "learning_rate": 0.00019936983823181132,
+      "loss": 0.8154,
+      "step": 122
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.4589102632667276,
+      "learning_rate": 0.0001993503206718859,
+      "loss": 0.8003,
+      "step": 123
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 0.4838485949407181,
+      "learning_rate": 0.00019933050643682269,
+      "loss": 0.7771,
+      "step": 124
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.44507222590884743,
+      "learning_rate": 0.00019931039558578997,
+      "loss": 0.8383,
+      "step": 125
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.37569952847305876,
+      "learning_rate": 0.00019928998817884182,
+      "loss": 0.785,
+      "step": 126
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 0.4117299081017675,
+      "learning_rate": 0.00019926928427691786,
+      "loss": 0.7823,
+      "step": 127
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 0.44170602136108966,
+      "learning_rate": 0.00019924828394184306,
+      "loss": 0.8539,
+      "step": 128
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.4218611277342427,
+      "learning_rate": 0.00019922698723632767,
+      "loss": 0.8121,
+      "step": 129
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 0.5418620836987449,
+      "learning_rate": 0.0001992053942239668,
+      "loss": 0.9242,
+      "step": 130
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 0.42686140928716687,
+      "learning_rate": 0.0001991835049692405,
+      "loss": 0.779,
+      "step": 131
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.4741185695499851,
+      "learning_rate": 0.00019916131953751342,
+      "loss": 0.8416,
+      "step": 132
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 0.5113869593445932,
+      "learning_rate": 0.0001991388379950346,
+      "loss": 0.8909,
+      "step": 133
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 0.44021669828848753,
+      "learning_rate": 0.0001991160604089374,
+      "loss": 0.8145,
+      "step": 134
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.4331612856730797,
+      "learning_rate": 0.00019909298684723904,
+      "loss": 0.8013,
+      "step": 135
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 0.42869556780040946,
+      "learning_rate": 0.00019906961737884077,
+      "loss": 0.7729,
+      "step": 136
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 0.4798784272020571,
+      "learning_rate": 0.00019904595207352737,
+      "loss": 0.7671,
+      "step": 137
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.41352724710087835,
+      "learning_rate": 0.00019902199100196697,
+      "loss": 0.7767,
+      "step": 138
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 0.47713623847853925,
+      "learning_rate": 0.000198997734235711,
+      "loss": 0.8163,
+      "step": 139
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 0.4256501161580274,
+      "learning_rate": 0.00019897318184719385,
+      "loss": 0.7838,
+      "step": 140
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.40691106752334844,
+      "learning_rate": 0.00019894833390973266,
+      "loss": 0.7611,
+      "step": 141
+    },
+    {
+      "epoch": 0.07573333333333333,
+      "grad_norm": 0.43022441599079425,
+      "learning_rate": 0.0001989231904975272,
+      "loss": 0.789,
+      "step": 142
+    },
+    {
+      "epoch": 0.07626666666666666,
+      "grad_norm": 0.45737675791473775,
+      "learning_rate": 0.00019889775168565943,
+      "loss": 0.7935,
+      "step": 143
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.40825035185860664,
+      "learning_rate": 0.00019887201755009357,
+      "loss": 0.8085,
+      "step": 144
+    },
+    {
+      "epoch": 0.07733333333333334,
+      "grad_norm": 0.4137346124558555,
+      "learning_rate": 0.00019884598816767563,
+      "loss": 0.7492,
+      "step": 145
+    },
+    {
+      "epoch": 0.07786666666666667,
+      "grad_norm": 0.5480064578296908,
+      "learning_rate": 0.0001988196636161333,
+      "loss": 0.8338,
+      "step": 146
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.4811157456470868,
+      "learning_rate": 0.0001987930439740757,
+      "loss": 0.8479,
+      "step": 147
+    },
+    {
+      "epoch": 0.07893333333333333,
+      "grad_norm": 0.42586764755902906,
+      "learning_rate": 0.00019876612932099308,
+      "loss": 0.7396,
+      "step": 148
+    },
+    {
+      "epoch": 0.07946666666666667,
+      "grad_norm": 0.479091998968582,
+      "learning_rate": 0.0001987389197372567,
+      "loss": 0.8647,
+      "step": 149
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4386709001907441,
+      "learning_rate": 0.00019871141530411853,
+      "loss": 0.7694,
+      "step": 150
+    },
+    {
+      "epoch": 0.08053333333333333,
+      "grad_norm": 0.4790956554257266,
+      "learning_rate": 0.00019868361610371097,
+      "loss": 0.8169,
+      "step": 151
+    },
+    {
+      "epoch": 0.08106666666666666,
+      "grad_norm": 0.4745228037719035,
+      "learning_rate": 0.00019865552221904665,
+      "loss": 0.869,
+      "step": 152
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.5205449581005507,
+      "learning_rate": 0.0001986271337340182,
+      "loss": 0.7765,
+      "step": 153
+    },
+    {
+      "epoch": 0.08213333333333334,
+      "grad_norm": 0.6000799801790433,
+      "learning_rate": 0.00019859845073339787,
+      "loss": 0.9171,
+      "step": 154
+    },
+    {
+      "epoch": 0.08266666666666667,
+      "grad_norm": 0.44788944508580925,
+      "learning_rate": 0.00019856947330283752,
+      "loss": 0.8149,
+      "step": 155
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4157662736567421,
+      "learning_rate": 0.00019854020152886814,
+      "loss": 0.7555,
+      "step": 156
+    },
+    {
+      "epoch": 0.08373333333333334,
+      "grad_norm": 0.3832806174309398,
+      "learning_rate": 0.0001985106354988997,
+      "loss": 0.7223,
+      "step": 157
+    },
+    {
+      "epoch": 0.08426666666666667,
+      "grad_norm": 0.4699115996004792,
+      "learning_rate": 0.00019848077530122083,
+      "loss": 0.8032,
+      "step": 158
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.4169777330014803,
+      "learning_rate": 0.0001984506210249986,
+      "loss": 0.7244,
+      "step": 159
+    },
+    {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.46018201985666296,
+      "learning_rate": 0.00019842017276027832,
+      "loss": 0.8126,
+      "step": 160
+    },
+    {
+      "epoch": 0.08586666666666666,
+      "grad_norm": 0.4847459842380028,
+      "learning_rate": 0.00019838943059798304,
+      "loss": 0.7947,
+      "step": 161
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.3933096013250647,
+      "learning_rate": 0.00019835839462991361,
+      "loss": 0.7736,
+      "step": 162
+    },
+    {
+      "epoch": 0.08693333333333333,
+      "grad_norm": 0.45243818185571316,
+      "learning_rate": 0.0001983270649487481,
+      "loss": 0.8184,
+      "step": 163
+    },
+    {
+      "epoch": 0.08746666666666666,
+      "grad_norm": 0.4503595945797378,
+      "learning_rate": 0.0001982954416480417,
+      "loss": 0.8227,
+      "step": 164
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.44893678123815417,
+      "learning_rate": 0.00019826352482222638,
+      "loss": 0.8695,
+      "step": 165
+    },
+    {
+      "epoch": 0.08853333333333334,
+      "grad_norm": 0.4213210460142999,
+      "learning_rate": 0.00019823131456661063,
+      "loss": 0.825,
+      "step": 166
+    },
+    {
+      "epoch": 0.08906666666666667,
+      "grad_norm": 0.4426924506344481,
+      "learning_rate": 0.00019819881097737915,
+      "loss": 0.7672,
+      "step": 167
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4869073165660452,
+      "learning_rate": 0.00019816601415159263,
+      "loss": 0.8762,
+      "step": 168
+    },
+    {
+      "epoch": 0.09013333333333333,
+      "grad_norm": 0.40015259673376496,
+      "learning_rate": 0.00019813292418718732,
+      "loss": 0.785,
+      "step": 169
+    },
+    {
+      "epoch": 0.09066666666666667,
+      "grad_norm": 0.46373832017025285,
+      "learning_rate": 0.0001980995411829749,
+      "loss": 0.8625,
+      "step": 170
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.4598836311824102,
+      "learning_rate": 0.0001980658652386421,
+      "loss": 0.8046,
+      "step": 171
+    },
+    {
+      "epoch": 0.09173333333333333,
+      "grad_norm": 0.45784081632691526,
+      "learning_rate": 0.0001980318964547504,
+      "loss": 0.8193,
+      "step": 172
+    },
+    {
+      "epoch": 0.09226666666666666,
+      "grad_norm": 0.4392932987043056,
+      "learning_rate": 0.0001979976349327357,
+      "loss": 0.8307,
+      "step": 173
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4463844552503445,
+      "learning_rate": 0.00019796308077490817,
+      "loss": 0.7874,
+      "step": 174
+    },
+    {
+      "epoch": 0.09333333333333334,
+      "grad_norm": 0.4418559505901228,
+      "learning_rate": 0.00019792823408445174,
+      "loss": 0.8132,
+      "step": 175
+    },
+    {
+      "epoch": 0.09386666666666667,
+      "grad_norm": 0.4521536727961204,
+      "learning_rate": 0.0001978930949654239,
+      "loss": 0.7608,
+      "step": 176
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.4758650892647243,
+      "learning_rate": 0.00019785766352275542,
+      "loss": 0.8387,
+      "step": 177
+    },
+    {
+      "epoch": 0.09493333333333333,
+      "grad_norm": 0.4768329724525299,
+      "learning_rate": 0.00019782193986224995,
+      "loss": 0.8534,
+      "step": 178
+    },
+    {
+      "epoch": 0.09546666666666667,
+      "grad_norm": 0.42032771227871046,
+      "learning_rate": 0.00019778592409058378,
+      "loss": 0.8143,
+      "step": 179
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4144302224539125,
+      "learning_rate": 0.00019774961631530545,
+      "loss": 0.807,
+      "step": 180
+    },
+    {
+      "epoch": 0.09653333333333333,
+      "grad_norm": 0.39784095989327356,
+      "learning_rate": 0.0001977130166448355,
+      "loss": 0.7541,
+      "step": 181
+    },
+    {
+      "epoch": 0.09706666666666666,
+      "grad_norm": 0.42574876432860503,
+      "learning_rate": 0.00019767612518846608,
+      "loss": 0.7526,
+      "step": 182
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.5914800123257209,
+      "learning_rate": 0.00019763894205636072,
+      "loss": 0.8341,
+      "step": 183
+    },
+    {
+      "epoch": 0.09813333333333334,
+      "grad_norm": 0.42203359317691,
+      "learning_rate": 0.00019760146735955388,
+      "loss": 0.7692,
+      "step": 184
+    },
+    {
+      "epoch": 0.09866666666666667,
+      "grad_norm": 0.4410858516109663,
+      "learning_rate": 0.00019756370120995066,
+      "loss": 0.8106,
+      "step": 185
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4567139293491327,
+      "learning_rate": 0.00019752564372032657,
+      "loss": 0.7652,
+      "step": 186
+    },
+    {
+      "epoch": 0.09973333333333333,
+      "grad_norm": 0.37682007465653333,
+      "learning_rate": 0.000197487295004327,
+      "loss": 0.7596,
+      "step": 187
+    },
+    {
+      "epoch": 0.10026666666666667,
+      "grad_norm": 0.46226419690327275,
+      "learning_rate": 0.00019744865517646706,
+      "loss": 0.7881,
+      "step": 188
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.45747812537419225,
+      "learning_rate": 0.00019740972435213115,
+      "loss": 0.8334,
+      "step": 189
+    },
+    {
+      "epoch": 0.10133333333333333,
+      "grad_norm": 0.40377810796691194,
+      "learning_rate": 0.0001973705026475726,
+      "loss": 0.7971,
+      "step": 190
+    },
+    {
+      "epoch": 0.10186666666666666,
+      "grad_norm": 0.4053289604581338,
+      "learning_rate": 0.00019733099017991341,
+      "loss": 0.7793,
+      "step": 191
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4541349423682215,
+      "learning_rate": 0.00019729118706714375,
+      "loss": 0.7613,
+      "step": 192
+    },
+    {
+      "epoch": 0.10293333333333334,
+      "grad_norm": 0.43739594831129835,
+      "learning_rate": 0.0001972510934281218,
+      "loss": 0.8187,
+      "step": 193
+    },
+    {
+      "epoch": 0.10346666666666667,
+      "grad_norm": 0.41117651532362703,
+      "learning_rate": 0.00019721070938257324,
+      "loss": 0.8145,
+      "step": 194
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.4432280712699174,
+      "learning_rate": 0.00019717003505109095,
+      "loss": 0.7855,
+      "step": 195
+    },
+    {
+      "epoch": 0.10453333333333334,
+      "grad_norm": 0.40787314100649424,
+      "learning_rate": 0.0001971290705551347,
+      "loss": 0.8185,
+      "step": 196
+    },
+    {
+      "epoch": 0.10506666666666667,
+      "grad_norm": 0.35008609049841144,
+      "learning_rate": 0.00019708781601703065,
+      "loss": 0.748,
+      "step": 197
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.45400474655538375,
+      "learning_rate": 0.00019704627155997108,
+      "loss": 0.8242,
+      "step": 198
+    },
+    {
+      "epoch": 0.10613333333333333,
+      "grad_norm": 0.4831309851229042,
+      "learning_rate": 0.00019700443730801413,
+      "loss": 0.827,
+      "step": 199
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.44584475765875253,
+      "learning_rate": 0.00019696231338608316,
+      "loss": 0.8331,
+      "step": 200
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.45461271032216954,
+      "learning_rate": 0.00019691989991996663,
+      "loss": 0.7473,
+      "step": 201
+    },
+    {
+      "epoch": 0.10773333333333333,
+      "grad_norm": 0.467815209535859,
+      "learning_rate": 0.00019687719703631755,
+      "loss": 0.8023,
+      "step": 202
+    },
+    {
+      "epoch": 0.10826666666666666,
+      "grad_norm": 0.4936230902501373,
+      "learning_rate": 0.00019683420486265327,
+      "loss": 0.876,
+      "step": 203
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4465659727029938,
+      "learning_rate": 0.0001967909235273549,
+      "loss": 0.7856,
+      "step": 204
+    },
+    {
+      "epoch": 0.10933333333333334,
+      "grad_norm": 0.42387291428457186,
+      "learning_rate": 0.0001967473531596671,
+      "loss": 0.8244,
+      "step": 205
+    },
+    {
+      "epoch": 0.10986666666666667,
+      "grad_norm": 0.4481368721925528,
+      "learning_rate": 0.0001967034938896976,
+      "loss": 0.7337,
+      "step": 206
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.4274311370798581,
+      "learning_rate": 0.00019665934584841682,
+      "loss": 0.7881,
+      "step": 207
+    },
+    {
+      "epoch": 0.11093333333333333,
+      "grad_norm": 0.4391148343206368,
+      "learning_rate": 0.0001966149091676575,
+      "loss": 0.8075,
+      "step": 208
+    },
+    {
+      "epoch": 0.11146666666666667,
+      "grad_norm": 0.49054062967453904,
+      "learning_rate": 0.00019657018398011434,
+      "loss": 0.8624,
+      "step": 209
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5772965435325145,
+      "learning_rate": 0.00019652517041934356,
+      "loss": 0.7413,
+      "step": 210
+    },
+    {
+      "epoch": 0.11253333333333333,
+      "grad_norm": 0.4477575306069233,
+      "learning_rate": 0.00019647986861976246,
+      "loss": 0.7927,
+      "step": 211
+    },
+    {
+      "epoch": 0.11306666666666666,
+      "grad_norm": 0.4780165280100849,
+      "learning_rate": 0.0001964342787166491,
+      "loss": 0.829,
+      "step": 212
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.4072145848307551,
+      "learning_rate": 0.00019638840084614182,
+      "loss": 0.7687,
+      "step": 213
+    },
+    {
+      "epoch": 0.11413333333333334,
+      "grad_norm": 0.4287471275372935,
+      "learning_rate": 0.0001963422351452389,
+      "loss": 0.7509,
+      "step": 214
+    },
+    {
+      "epoch": 0.11466666666666667,
+      "grad_norm": 0.495437619970543,
+      "learning_rate": 0.0001962957817517982,
+      "loss": 0.9653,
+      "step": 215
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.40277028765977974,
+      "learning_rate": 0.00019624904080453655,
+      "loss": 0.786,
+      "step": 216
+    },
+    {
+      "epoch": 0.11573333333333333,
+      "grad_norm": 0.4181818299463913,
+      "learning_rate": 0.00019620201244302952,
+      "loss": 0.8017,
+      "step": 217
+    },
+    {
+      "epoch": 0.11626666666666667,
+      "grad_norm": 0.5017230228205042,
+      "learning_rate": 0.00019615469680771096,
+      "loss": 0.8789,
+      "step": 218
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.4944205151128549,
+      "learning_rate": 0.00019610709403987246,
+      "loss": 0.8088,
+      "step": 219
+    },
+    {
+      "epoch": 0.11733333333333333,
+      "grad_norm": 0.43279911669567134,
+      "learning_rate": 0.00019605920428166323,
+      "loss": 0.8379,
+      "step": 220
+    },
+    {
+      "epoch": 0.11786666666666666,
+      "grad_norm": 0.4185114711196277,
+      "learning_rate": 0.00019601102767608923,
+      "loss": 0.738,
+      "step": 221
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.48305042625641237,
+      "learning_rate": 0.00019596256436701324,
+      "loss": 0.8525,
+      "step": 222
+    },
+    {
+      "epoch": 0.11893333333333334,
+      "grad_norm": 0.40438229405832726,
+      "learning_rate": 0.00019591381449915397,
+      "loss": 0.7681,
+      "step": 223
+    },
+    {
+      "epoch": 0.11946666666666667,
+      "grad_norm": 0.46405607397443416,
+      "learning_rate": 0.00019586477821808597,
+      "loss": 0.8542,
+      "step": 224
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.4112789795870178,
+      "learning_rate": 0.000195815455670239,
+      "loss": 0.7951,
+      "step": 225
+    },
+    {
+      "epoch": 0.12053333333333334,
+      "grad_norm": 0.45655927001958874,
+      "learning_rate": 0.00019576584700289768,
+      "loss": 0.7571,
+      "step": 226
+    },
+    {
+      "epoch": 0.12106666666666667,
+      "grad_norm": 0.4656737851703908,
+      "learning_rate": 0.00019571595236420102,
+      "loss": 0.8518,
+      "step": 227
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4387941097031272,
+      "learning_rate": 0.00019566577190314197,
+      "loss": 0.7522,
+      "step": 228
+    },
+    {
+      "epoch": 0.12213333333333333,
+      "grad_norm": 0.40736854091545094,
+      "learning_rate": 0.00019561530576956703,
+      "loss": 0.7703,
+      "step": 229
+    },
+    {
+      "epoch": 0.12266666666666666,
+      "grad_norm": 0.4359474487913427,
+      "learning_rate": 0.00019556455411417573,
+      "loss": 0.8037,
+      "step": 230
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.3881636760799097,
+      "learning_rate": 0.0001955135170885202,
+      "loss": 0.6854,
+      "step": 231
+    },
+    {
+      "epoch": 0.12373333333333333,
+      "grad_norm": 0.43904303453450677,
+      "learning_rate": 0.00019546219484500475,
+      "loss": 0.7743,
+      "step": 232
+    },
+    {
+      "epoch": 0.12426666666666666,
+      "grad_norm": 0.41029410599851096,
+      "learning_rate": 0.00019541058753688538,
+      "loss": 0.7017,
+      "step": 233
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.43878985252506475,
+      "learning_rate": 0.00019535869531826937,
+      "loss": 0.7386,
+      "step": 234
+    },
+    {
+      "epoch": 0.12533333333333332,
+      "grad_norm": 0.4732376116029275,
+      "learning_rate": 0.00019530651834411474,
+      "loss": 0.7848,
+      "step": 235
+    },
+    {
+      "epoch": 0.12586666666666665,
+      "grad_norm": 0.42511634202293586,
+      "learning_rate": 0.00019525405677022989,
+      "loss": 0.7902,
+      "step": 236
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.4196252128724456,
+      "learning_rate": 0.00019520131075327298,
+      "loss": 0.7847,
+      "step": 237
+    },
+    {
+      "epoch": 0.12693333333333334,
+      "grad_norm": 0.48381972568587917,
+      "learning_rate": 0.0001951482804507517,
+      "loss": 0.7752,
+      "step": 238
+    },
+    {
+      "epoch": 0.12746666666666667,
+      "grad_norm": 0.42465414849497046,
+      "learning_rate": 0.00019509496602102252,
+      "loss": 0.7862,
+      "step": 239
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.45180889888062,
+      "learning_rate": 0.00019504136762329047,
+      "loss": 0.8044,
+      "step": 240
+    },
+    {
+      "epoch": 0.12853333333333333,
+      "grad_norm": 0.44489280485939253,
+      "learning_rate": 0.00019498748541760846,
+      "loss": 0.8037,
+      "step": 241
+    },
+    {
+      "epoch": 0.12906666666666666,
+      "grad_norm": 0.40437378265677226,
+      "learning_rate": 0.0001949333195648769,
+      "loss": 0.718,
+      "step": 242
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.4109141127236639,
+      "learning_rate": 0.00019487887022684336,
+      "loss": 0.777,
+      "step": 243
+    },
+    {
+      "epoch": 0.13013333333333332,
+      "grad_norm": 0.4707454791021639,
+      "learning_rate": 0.00019482413756610173,
+      "loss": 0.8686,
+      "step": 244
+    },
+    {
+      "epoch": 0.13066666666666665,
+      "grad_norm": 0.47712190941329363,
+      "learning_rate": 0.0001947691217460921,
+      "loss": 0.8097,
+      "step": 245
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.3992104315095112,
+      "learning_rate": 0.00019471382293110003,
+      "loss": 0.7352,
+      "step": 246
+    },
+    {
+      "epoch": 0.13173333333333334,
+      "grad_norm": 0.4137650229412225,
+      "learning_rate": 0.00019465824128625617,
+      "loss": 0.7848,
+      "step": 247
+    },
+    {
+      "epoch": 0.13226666666666667,
+      "grad_norm": 0.3861068942107445,
+      "learning_rate": 0.00019460237697753577,
+      "loss": 0.754,
+      "step": 248
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.42687614828833265,
+      "learning_rate": 0.00019454623017175812,
+      "loss": 0.8449,
+      "step": 249
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.4772395079126402,
+      "learning_rate": 0.00019448980103658613,
+      "loss": 0.8213,
+      "step": 250
+    },
+    {
+      "epoch": 0.13386666666666666,
+      "grad_norm": 0.4561418527462904,
+      "learning_rate": 0.0001944330897405257,
+      "loss": 0.764,
+      "step": 251
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.45324284614374627,
+      "learning_rate": 0.00019437609645292546,
+      "loss": 0.8145,
+      "step": 252
+    },
+    {
+      "epoch": 0.13493333333333332,
+      "grad_norm": 0.42990871413569814,
+      "learning_rate": 0.00019431882134397598,
+      "loss": 0.757,
+      "step": 253
+    },
+    {
+      "epoch": 0.13546666666666668,
+      "grad_norm": 0.4242442269517252,
+      "learning_rate": 0.00019426126458470936,
+      "loss": 0.758,
+      "step": 254
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.4961090507914223,
+      "learning_rate": 0.0001942034263469989,
+      "loss": 0.9201,
+      "step": 255
+    },
+    {
+      "epoch": 0.13653333333333334,
+      "grad_norm": 0.4388177110934772,
+      "learning_rate": 0.00019414530680355837,
+      "loss": 0.7998,
+      "step": 256
+    },
+    {
+      "epoch": 0.13706666666666667,
+      "grad_norm": 0.4334726427754465,
+      "learning_rate": 0.00019408690612794148,
+      "loss": 0.7472,
+      "step": 257
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5245679474958836,
+      "learning_rate": 0.00019402822449454153,
+      "loss": 0.7815,
+      "step": 258
+    },
+    {
+      "epoch": 0.13813333333333333,
+      "grad_norm": 0.3984525106489841,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.7792,
+      "step": 259
+    },
+    {
+      "epoch": 0.13866666666666666,
+      "grad_norm": 0.5470105815848786,
+      "learning_rate": 0.0001939100190561601,
+      "loss": 0.7804,
+      "step": 260
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.4190062148806821,
+      "learning_rate": 0.00019385049560415794,
+      "loss": 0.7437,
+      "step": 261
+    },
+    {
+      "epoch": 0.13973333333333332,
+      "grad_norm": 0.4280626833445742,
+      "learning_rate": 0.0001937906919003304,
+      "loss": 0.7799,
+      "step": 262
+    },
+    {
+      "epoch": 0.14026666666666668,
+      "grad_norm": 0.40012436181706396,
+      "learning_rate": 0.00019373060812326052,
+      "loss": 0.7688,
+      "step": 263
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.45684070383709646,
+      "learning_rate": 0.00019367024445236754,
+      "loss": 0.8578,
+      "step": 264
+    },
+    {
+      "epoch": 0.14133333333333334,
+      "grad_norm": 0.4242078593000147,
+      "learning_rate": 0.00019360960106790643,
+      "loss": 0.8441,
+      "step": 265
+    },
+    {
+      "epoch": 0.14186666666666667,
+      "grad_norm": 0.44327384760435773,
+      "learning_rate": 0.0001935486781509677,
+      "loss": 0.791,
+      "step": 266
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.4129859009310087,
+      "learning_rate": 0.00019348747588347637,
+      "loss": 0.7783,
+      "step": 267
+    },
+    {
+      "epoch": 0.14293333333333333,
+      "grad_norm": 0.46098278051698005,
+      "learning_rate": 0.00019342599444819168,
+      "loss": 0.8192,
+      "step": 268
+    },
+    {
+      "epoch": 0.14346666666666666,
+      "grad_norm": 0.43637705533574567,
+      "learning_rate": 0.00019336423402870653,
+      "loss": 0.8436,
+      "step": 269
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.38549987157479443,
+      "learning_rate": 0.00019330219480944694,
+      "loss": 0.6746,
+      "step": 270
+    },
+    {
+      "epoch": 0.14453333333333335,
+      "grad_norm": 0.3686476192402348,
+      "learning_rate": 0.0001932398769756714,
+      "loss": 0.7152,
+      "step": 271
+    },
+    {
+      "epoch": 0.14506666666666668,
+      "grad_norm": 0.4476828610032896,
+      "learning_rate": 0.0001931772807134704,
+      "loss": 0.7038,
+      "step": 272
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.41171180003013674,
+      "learning_rate": 0.00019311440620976597,
+      "loss": 0.7968,
+      "step": 273
+    },
+    {
+      "epoch": 0.14613333333333334,
+      "grad_norm": 0.402059085652968,
+      "learning_rate": 0.00019305125365231084,
+      "loss": 0.7333,
+      "step": 274
+    },
+    {
+      "epoch": 0.14666666666666667,
+      "grad_norm": 0.4056707425304524,
+      "learning_rate": 0.00019298782322968815,
+      "loss": 0.7333,
+      "step": 275
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.43689537157450414,
+      "learning_rate": 0.0001929241151313108,
+      "loss": 0.8249,
+      "step": 276
+    },
+    {
+      "epoch": 0.14773333333333333,
+      "grad_norm": 0.4538939992672516,
+      "learning_rate": 0.0001928601295474208,
+      "loss": 0.7963,
+      "step": 277
+    },
+    {
+      "epoch": 0.14826666666666666,
+      "grad_norm": 0.40284862318826553,
+      "learning_rate": 0.00019279586666908884,
+      "loss": 0.7815,
+      "step": 278
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.4216230725957816,
+      "learning_rate": 0.00019273132668821364,
+      "loss": 0.7518,
+      "step": 279
+    },
+    {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.3900139810053542,
+      "learning_rate": 0.00019266650979752136,
+      "loss": 0.7589,
+      "step": 280
+    },
+    {
+      "epoch": 0.14986666666666668,
+      "grad_norm": 0.4149093035848671,
+      "learning_rate": 0.00019260141619056507,
+      "loss": 0.7943,
+      "step": 281
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.48297996377698577,
+      "learning_rate": 0.00019253604606172417,
+      "loss": 0.9175,
+      "step": 282
+    },
+    {
+      "epoch": 0.15093333333333334,
+      "grad_norm": 0.4351076487672295,
+      "learning_rate": 0.0001924703996062038,
+      "loss": 0.8047,
+      "step": 283
+    },
+    {
+      "epoch": 0.15146666666666667,
+      "grad_norm": 0.39083187688691845,
+      "learning_rate": 0.0001924044770200342,
+      "loss": 0.742,
+      "step": 284
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.48315077907731574,
+      "learning_rate": 0.00019233827850007027,
+      "loss": 0.8193,
+      "step": 285
+    },
+    {
+      "epoch": 0.15253333333333333,
+      "grad_norm": 0.46103679574880274,
+      "learning_rate": 0.0001922718042439908,
+      "loss": 0.8755,
+      "step": 286
+    },
+    {
+      "epoch": 0.15306666666666666,
+      "grad_norm": 0.43190474218702174,
+      "learning_rate": 0.000192205054450298,
+      "loss": 0.7402,
+      "step": 287
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4417586707445263,
+      "learning_rate": 0.00019213802931831696,
+      "loss": 0.825,
+      "step": 288
+    },
+    {
+      "epoch": 0.15413333333333334,
+      "grad_norm": 0.5454072533865187,
+      "learning_rate": 0.00019207072904819486,
+      "loss": 0.8484,
+      "step": 289
+    },
+    {
+      "epoch": 0.15466666666666667,
+      "grad_norm": 0.45900909358959874,
+      "learning_rate": 0.00019200315384090044,
+      "loss": 0.8793,
+      "step": 290
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.4418764722298391,
+      "learning_rate": 0.00019193530389822363,
+      "loss": 0.7728,
+      "step": 291
+    },
+    {
+      "epoch": 0.15573333333333333,
+      "grad_norm": 0.41682311824111307,
+      "learning_rate": 0.00019186717942277462,
+      "loss": 0.8164,
+      "step": 292
+    },
+    {
+      "epoch": 0.15626666666666666,
+      "grad_norm": 0.4920244659928006,
+      "learning_rate": 0.00019179878061798347,
+      "loss": 0.8679,
+      "step": 293
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.36330437546048105,
+      "learning_rate": 0.00019173010768809933,
+      "loss": 0.692,
+      "step": 294
+    },
+    {
+      "epoch": 0.15733333333333333,
+      "grad_norm": 0.4142764492365718,
+      "learning_rate": 0.00019166116083819002,
+      "loss": 0.8216,
+      "step": 295
+    },
+    {
+      "epoch": 0.15786666666666666,
+      "grad_norm": 0.498936164950068,
+      "learning_rate": 0.00019159194027414128,
+      "loss": 0.78,
+      "step": 296
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.42441210039175953,
+      "learning_rate": 0.0001915224462026563,
+      "loss": 0.8191,
+      "step": 297
+    },
+    {
+      "epoch": 0.15893333333333334,
+      "grad_norm": 0.42509304195789493,
+      "learning_rate": 0.00019145267883125482,
+      "loss": 0.7581,
+      "step": 298
+    },
+    {
+      "epoch": 0.15946666666666667,
+      "grad_norm": 0.4267126964232961,
+      "learning_rate": 0.00019138263836827288,
+      "loss": 0.8104,
+      "step": 299
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4226704907969013,
+      "learning_rate": 0.00019131232502286188,
+      "loss": 0.8098,
+      "step": 300
+    },
+    {
+      "epoch": 0.16053333333333333,
+      "grad_norm": 0.4798403273069166,
+      "learning_rate": 0.00019124173900498818,
+      "loss": 0.8074,
+      "step": 301
+    },
+    {
+      "epoch": 0.16106666666666666,
+      "grad_norm": 0.44649240559336156,
+      "learning_rate": 0.00019117088052543233,
+      "loss": 0.7971,
+      "step": 302
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.4961604744806426,
+      "learning_rate": 0.0001910997497957885,
+      "loss": 0.8425,
+      "step": 303
+    },
+    {
+      "epoch": 0.16213333333333332,
+      "grad_norm": 0.35220160746231455,
+      "learning_rate": 0.00019102834702846387,
+      "loss": 0.7055,
+      "step": 304
+    },
+    {
+      "epoch": 0.16266666666666665,
+      "grad_norm": 0.43901112282862115,
+      "learning_rate": 0.0001909566724366779,
+      "loss": 0.7687,
+      "step": 305
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4341322213872506,
+      "learning_rate": 0.00019088472623446183,
+      "loss": 0.7817,
+      "step": 306
+    },
+    {
+      "epoch": 0.16373333333333334,
+      "grad_norm": 0.39210795988100916,
+      "learning_rate": 0.00019081250863665794,
+      "loss": 0.7374,
+      "step": 307
+    },
+    {
+      "epoch": 0.16426666666666667,
+      "grad_norm": 0.4249436264642188,
+      "learning_rate": 0.0001907400198589189,
+      "loss": 0.8085,
+      "step": 308
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.40231474289508434,
+      "learning_rate": 0.00019066726011770726,
+      "loss": 0.7013,
+      "step": 309
+    },
+    {
+      "epoch": 0.16533333333333333,
+      "grad_norm": 0.46152241459367194,
+      "learning_rate": 0.00019059422963029464,
+      "loss": 0.7498,
+      "step": 310
+    },
+    {
+      "epoch": 0.16586666666666666,
+      "grad_norm": 0.44690053275739583,
+      "learning_rate": 0.0001905209286147611,
+      "loss": 0.7858,
+      "step": 311
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4569472159606826,
+      "learning_rate": 0.0001904473572899947,
+      "loss": 0.8488,
+      "step": 312
+    },
+    {
+      "epoch": 0.16693333333333332,
+      "grad_norm": 0.3862654675484178,
+      "learning_rate": 0.0001903735158756905,
+      "loss": 0.7148,
+      "step": 313
+    },
+    {
+      "epoch": 0.16746666666666668,
+      "grad_norm": 0.4469532424952468,
+      "learning_rate": 0.0001902994045923502,
+      "loss": 0.9058,
+      "step": 314
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.41512658359168486,
+      "learning_rate": 0.00019022502366128135,
+      "loss": 0.7564,
+      "step": 315
+    },
+    {
+      "epoch": 0.16853333333333334,
+      "grad_norm": 0.43849788868737033,
+      "learning_rate": 0.0001901503733045967,
+      "loss": 0.8146,
+      "step": 316
+    },
+    {
+      "epoch": 0.16906666666666667,
+      "grad_norm": 0.42578035574036255,
+      "learning_rate": 0.00019007545374521355,
+      "loss": 0.7647,
+      "step": 317
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4287376466249776,
+      "learning_rate": 0.00019000026520685302,
+      "loss": 0.7968,
+      "step": 318
+    },
+    {
+      "epoch": 0.17013333333333333,
+      "grad_norm": 0.3998036758901615,
+      "learning_rate": 0.00018992480791403958,
+      "loss": 0.7385,
+      "step": 319
+    },
+    {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.43530756222877093,
+      "learning_rate": 0.0001898490820921001,
+      "loss": 0.7785,
+      "step": 320
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.44817237110630764,
+      "learning_rate": 0.0001897730879671634,
+      "loss": 0.7894,
+      "step": 321
+    },
+    {
+      "epoch": 0.17173333333333332,
+      "grad_norm": 0.4045125640797333,
+      "learning_rate": 0.0001896968257661595,
+      "loss": 0.7022,
+      "step": 322
+    },
+    {
+      "epoch": 0.17226666666666668,
+      "grad_norm": 0.38106638033280676,
+      "learning_rate": 0.00018962029571681886,
+      "loss": 0.7275,
+      "step": 323
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4322287827747288,
+      "learning_rate": 0.00018954349804767184,
+      "loss": 0.7832,
+      "step": 324
+    },
+    {
+      "epoch": 0.17333333333333334,
+      "grad_norm": 0.3840790514719407,
+      "learning_rate": 0.00018946643298804793,
+      "loss": 0.7287,
+      "step": 325
+    },
+    {
+      "epoch": 0.17386666666666667,
+      "grad_norm": 0.4923056202155615,
+      "learning_rate": 0.00018938910076807513,
+      "loss": 0.8511,
+      "step": 326
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.38688686764024316,
+      "learning_rate": 0.00018931150161867916,
+      "loss": 0.6976,
+      "step": 327
+    },
+    {
+      "epoch": 0.17493333333333333,
+      "grad_norm": 0.4228516097817739,
+      "learning_rate": 0.0001892336357715829,
+      "loss": 0.7898,
+      "step": 328
+    },
+    {
+      "epoch": 0.17546666666666666,
+      "grad_norm": 0.3591065525009827,
+      "learning_rate": 0.0001891555034593055,
+      "loss": 0.6503,
+      "step": 329
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.37535288131691275,
+      "learning_rate": 0.00018907710491516199,
+      "loss": 0.7257,
+      "step": 330
+    },
+    {
+      "epoch": 0.17653333333333332,
+      "grad_norm": 0.4189502201973536,
+      "learning_rate": 0.00018899844037326225,
+      "loss": 0.8156,
+      "step": 331
+    },
+    {
+      "epoch": 0.17706666666666668,
+      "grad_norm": 0.41085459937546664,
+      "learning_rate": 0.0001889195100685106,
+      "loss": 0.7641,
+      "step": 332
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.41399903331638765,
+      "learning_rate": 0.0001888403142366049,
+      "loss": 0.7879,
+      "step": 333
+    },
+    {
+      "epoch": 0.17813333333333334,
+      "grad_norm": 0.42894816323716317,
+      "learning_rate": 0.00018876085311403593,
+      "loss": 0.8017,
+      "step": 334
+    },
+    {
+      "epoch": 0.17866666666666667,
+      "grad_norm": 0.49675744246390297,
+      "learning_rate": 0.00018868112693808665,
+      "loss": 0.8291,
+      "step": 335
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.460752202699587,
+      "learning_rate": 0.00018860113594683148,
+      "loss": 0.8788,
+      "step": 336
+    },
+    {
+      "epoch": 0.17973333333333333,
+      "grad_norm": 0.4637953576214901,
+      "learning_rate": 0.00018852088037913577,
+      "loss": 0.8106,
+      "step": 337
+    },
+    {
+      "epoch": 0.18026666666666666,
+      "grad_norm": 0.4201668333522856,
+      "learning_rate": 0.0001884403604746547,
+      "loss": 0.773,
+      "step": 338
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.43774102524247854,
+      "learning_rate": 0.00018835957647383303,
+      "loss": 0.8039,
+      "step": 339
+    },
+    {
+      "epoch": 0.18133333333333335,
+      "grad_norm": 0.3587546918505062,
+      "learning_rate": 0.00018827852861790398,
+      "loss": 0.705,
+      "step": 340
+    },
+    {
+      "epoch": 0.18186666666666668,
+      "grad_norm": 0.4126796284740049,
+      "learning_rate": 0.00018819721714888877,
+      "loss": 0.7536,
+      "step": 341
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.39217298838322884,
+      "learning_rate": 0.00018811564230959588,
+      "loss": 0.7286,
+      "step": 342
+    },
+    {
+      "epoch": 0.18293333333333334,
+      "grad_norm": 0.4216981617215113,
+      "learning_rate": 0.00018803380434362,
+      "loss": 0.7836,
+      "step": 343
+    },
+    {
+      "epoch": 0.18346666666666667,
+      "grad_norm": 0.427309007762194,
+      "learning_rate": 0.0001879517034953418,
+      "loss": 0.8332,
+      "step": 344
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.4393362068035181,
+      "learning_rate": 0.00018786934000992688,
+      "loss": 0.7638,
+      "step": 345
+    },
+    {
+      "epoch": 0.18453333333333333,
+      "grad_norm": 0.4563013922481514,
+      "learning_rate": 0.00018778671413332513,
+      "loss": 0.8204,
+      "step": 346
+    },
+    {
+      "epoch": 0.18506666666666666,
+      "grad_norm": 0.4069115922933758,
+      "learning_rate": 0.00018770382611226987,
+      "loss": 0.8024,
+      "step": 347
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4019532370457461,
+      "learning_rate": 0.00018762067619427746,
+      "loss": 0.7084,
+      "step": 348
+    },
+    {
+      "epoch": 0.18613333333333335,
+      "grad_norm": 0.399054521750121,
+      "learning_rate": 0.000187537264627646,
+      "loss": 0.6556,
+      "step": 349
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.41008227332881475,
+      "learning_rate": 0.00018745359166145523,
+      "loss": 0.7185,
+      "step": 350
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.41658475803580297,
+      "learning_rate": 0.00018736965754556528,
+      "loss": 0.7656,
+      "step": 351
+    },
+    {
+      "epoch": 0.18773333333333334,
+      "grad_norm": 0.4615586902622352,
+      "learning_rate": 0.00018728546253061614,
+      "loss": 0.7824,
+      "step": 352
+    },
+    {
+      "epoch": 0.18826666666666667,
+      "grad_norm": 0.5238526606339945,
+      "learning_rate": 0.00018720100686802694,
+      "loss": 0.8099,
+      "step": 353
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.42702857946584744,
+      "learning_rate": 0.00018711629080999504,
+      "loss": 0.7898,
+      "step": 354
+    },
+    {
+      "epoch": 0.18933333333333333,
+      "grad_norm": 0.4866882638590615,
+      "learning_rate": 0.00018703131460949554,
+      "loss": 0.8483,
+      "step": 355
+    },
+    {
+      "epoch": 0.18986666666666666,
+      "grad_norm": 0.4276742751000265,
+      "learning_rate": 0.0001869460785202802,
+      "loss": 0.7809,
+      "step": 356
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.4100965474634774,
+      "learning_rate": 0.00018686058279687698,
+      "loss": 0.7363,
+      "step": 357
+    },
+    {
+      "epoch": 0.19093333333333334,
+      "grad_norm": 0.4664754062600203,
+      "learning_rate": 0.00018677482769458904,
+      "loss": 0.7483,
+      "step": 358
+    },
+    {
+      "epoch": 0.19146666666666667,
+      "grad_norm": 0.37361963672851983,
+      "learning_rate": 0.00018668881346949417,
+      "loss": 0.717,
+      "step": 359
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.43233657296858197,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.7615,
+      "step": 360
+    },
+    {
+      "epoch": 0.19253333333333333,
+      "grad_norm": 0.44458200899568584,
+      "learning_rate": 0.00018651600867906272,
+      "loss": 0.7609,
+      "step": 361
+    },
+    {
+      "epoch": 0.19306666666666666,
+      "grad_norm": 0.4048535668894861,
+      "learning_rate": 0.00018642921862974742,
+      "loss": 0.7531,
+      "step": 362
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.4045867025921511,
+      "learning_rate": 0.00018634217048966637,
+      "loss": 0.757,
+      "step": 363
+    },
+    {
+      "epoch": 0.19413333333333332,
+      "grad_norm": 0.4605392568978197,
+      "learning_rate": 0.00018625486451875843,
+      "loss": 0.8182,
+      "step": 364
+    },
+    {
+      "epoch": 0.19466666666666665,
+      "grad_norm": 0.4769558757285831,
+      "learning_rate": 0.0001861673009777325,
+      "loss": 0.8203,
+      "step": 365
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.40387606807518167,
+      "learning_rate": 0.0001860794801280666,
+      "loss": 0.7414,
+      "step": 366
+    },
+    {
+      "epoch": 0.19573333333333334,
+      "grad_norm": 0.4492733554879574,
+      "learning_rate": 0.00018599140223200716,
+      "loss": 0.7475,
+      "step": 367
+    },
+    {
+      "epoch": 0.19626666666666667,
+      "grad_norm": 0.4760445921931973,
+      "learning_rate": 0.0001859030675525681,
+      "loss": 0.8217,
+      "step": 368
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.4466095157119117,
+      "learning_rate": 0.0001858144763535302,
+      "loss": 0.821,
+      "step": 369
+    },
+    {
+      "epoch": 0.19733333333333333,
+      "grad_norm": 0.36784926334970397,
+      "learning_rate": 0.0001857256288994402,
+      "loss": 0.708,
+      "step": 370
+    },
+    {
+      "epoch": 0.19786666666666666,
+      "grad_norm": 0.4664739978613938,
+      "learning_rate": 0.00018563652545561013,
+      "loss": 0.7819,
+      "step": 371
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.48964202895281556,
+      "learning_rate": 0.0001855471662881164,
+      "loss": 0.8729,
+      "step": 372
+    },
+    {
+      "epoch": 0.19893333333333332,
+      "grad_norm": 0.40185566940796785,
+      "learning_rate": 0.000185457551663799,
+      "loss": 0.767,
+      "step": 373
+    },
+    {
+      "epoch": 0.19946666666666665,
+      "grad_norm": 0.43300712005598424,
+      "learning_rate": 0.00018536768185026083,
+      "loss": 0.8244,
+      "step": 374
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4275700118931833,
+      "learning_rate": 0.00018527755711586678,
+      "loss": 0.8337,
+      "step": 375
+    },
+    {
+      "epoch": 0.20053333333333334,
+      "grad_norm": 0.4206666916148894,
+      "learning_rate": 0.00018518717772974302,
+      "loss": 0.7714,
+      "step": 376
+    },
+    {
+      "epoch": 0.20106666666666667,
+      "grad_norm": 0.4836231192137557,
+      "learning_rate": 0.00018509654396177609,
+      "loss": 0.7585,
+      "step": 377
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.42887079426279345,
+      "learning_rate": 0.00018500565608261214,
+      "loss": 0.7611,
+      "step": 378
+    },
+    {
+      "epoch": 0.20213333333333333,
+      "grad_norm": 0.3923492742652943,
+      "learning_rate": 0.00018491451436365627,
+      "loss": 0.7655,
+      "step": 379
+    },
+    {
+      "epoch": 0.20266666666666666,
+      "grad_norm": 0.37381754593752975,
+      "learning_rate": 0.0001848231190770714,
+      "loss": 0.7016,
+      "step": 380
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.45699872442911477,
+      "learning_rate": 0.00018473147049577774,
+      "loss": 0.7581,
+      "step": 381
+    },
+    {
+      "epoch": 0.20373333333333332,
+      "grad_norm": 0.4103872069758534,
+      "learning_rate": 0.00018463956889345194,
+      "loss": 0.7452,
+      "step": 382
+    },
+    {
+      "epoch": 0.20426666666666668,
+      "grad_norm": 0.4019382931211576,
+      "learning_rate": 0.00018454741454452603,
+      "loss": 0.7456,
+      "step": 383
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4308887476747637,
+      "learning_rate": 0.00018445500772418697,
+      "loss": 0.7786,
+      "step": 384
+    },
+    {
+      "epoch": 0.20533333333333334,
+      "grad_norm": 0.4349446345312603,
+      "learning_rate": 0.00018436234870837547,
+      "loss": 0.8067,
+      "step": 385
+    },
+    {
+      "epoch": 0.20586666666666667,
+      "grad_norm": 0.4169251670596901,
+      "learning_rate": 0.00018426943777378552,
+      "loss": 0.7929,
+      "step": 386
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.4264377711670068,
+      "learning_rate": 0.00018417627519786315,
+      "loss": 0.7701,
+      "step": 387
+    },
+    {
+      "epoch": 0.20693333333333333,
+      "grad_norm": 0.43985200886507086,
+      "learning_rate": 0.00018408286125880604,
+      "loss": 0.7834,
+      "step": 388
+    },
+    {
+      "epoch": 0.20746666666666666,
+      "grad_norm": 0.4209543131074095,
+      "learning_rate": 0.00018398919623556238,
+      "loss": 0.8136,
+      "step": 389
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.36892798632147605,
+      "learning_rate": 0.00018389528040783012,
+      "loss": 0.7237,
+      "step": 390
+    },
+    {
+      "epoch": 0.20853333333333332,
+      "grad_norm": 0.4567328302763399,
+      "learning_rate": 0.0001838011140560562,
+      "loss": 0.7535,
+      "step": 391
+    },
+    {
+      "epoch": 0.20906666666666668,
+      "grad_norm": 0.42234184406921543,
+      "learning_rate": 0.00018370669746143564,
+      "loss": 0.7248,
+      "step": 392
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.41738156945110905,
+      "learning_rate": 0.00018361203090591071,
+      "loss": 0.7513,
+      "step": 393
+    },
+    {
+      "epoch": 0.21013333333333334,
+      "grad_norm": 0.41387492453563235,
+      "learning_rate": 0.0001835171146721701,
+      "loss": 0.762,
+      "step": 394
+    },
+    {
+      "epoch": 0.21066666666666667,
+      "grad_norm": 0.40940820687224067,
+      "learning_rate": 0.00018342194904364813,
+      "loss": 0.7426,
+      "step": 395
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.41896736068507107,
+      "learning_rate": 0.00018332653430452376,
+      "loss": 0.7974,
+      "step": 396
+    },
+    {
+      "epoch": 0.21173333333333333,
+      "grad_norm": 0.42726819872348015,
+      "learning_rate": 0.00018323087073971993,
+      "loss": 0.7633,
+      "step": 397
+    },
+    {
+      "epoch": 0.21226666666666666,
+      "grad_norm": 0.483129624128463,
+      "learning_rate": 0.00018313495863490258,
+      "loss": 0.7791,
+      "step": 398
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.42936893725682196,
+      "learning_rate": 0.00018303879827647975,
+      "loss": 0.6965,
+      "step": 399
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.49994335105567145,
+      "learning_rate": 0.00018294238995160094,
+      "loss": 0.7847,
+      "step": 400
+    },
+    {
+      "epoch": 0.21386666666666668,
+      "grad_norm": 0.42322594921195633,
+      "learning_rate": 0.00018284573394815597,
+      "loss": 0.7118,
+      "step": 401
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.48254626226918723,
+      "learning_rate": 0.00018274883055477436,
+      "loss": 0.829,
+      "step": 402
+    },
+    {
+      "epoch": 0.21493333333333334,
+      "grad_norm": 0.39378361115420063,
+      "learning_rate": 0.00018265168006082437,
+      "loss": 0.6769,
+      "step": 403
+    },
+    {
+      "epoch": 0.21546666666666667,
+      "grad_norm": 0.47535999469541607,
+      "learning_rate": 0.00018255428275641214,
+      "loss": 0.768,
+      "step": 404
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.41693174477042727,
+      "learning_rate": 0.00018245663893238075,
+      "loss": 0.7807,
+      "step": 405
+    },
+    {
+      "epoch": 0.21653333333333333,
+      "grad_norm": 0.48129339335753435,
+      "learning_rate": 0.0001823587488803095,
+      "loss": 0.8272,
+      "step": 406
+    },
+    {
+      "epoch": 0.21706666666666666,
+      "grad_norm": 0.41287072690689025,
+      "learning_rate": 0.00018226061289251298,
+      "loss": 0.7748,
+      "step": 407
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4398542226304193,
+      "learning_rate": 0.00018216223126204007,
+      "loss": 0.7408,
+      "step": 408
+    },
+    {
+      "epoch": 0.21813333333333335,
+      "grad_norm": 0.41730831330764306,
+      "learning_rate": 0.00018206360428267332,
+      "loss": 0.7336,
+      "step": 409
+    },
+    {
+      "epoch": 0.21866666666666668,
+      "grad_norm": 0.421603882985244,
+      "learning_rate": 0.00018196473224892784,
+      "loss": 0.7724,
+      "step": 410
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.4283142935953124,
+      "learning_rate": 0.00018186561545605054,
+      "loss": 0.7544,
+      "step": 411
+    },
+    {
+      "epoch": 0.21973333333333334,
+      "grad_norm": 0.44427983108831265,
+      "learning_rate": 0.0001817662542000192,
+      "loss": 0.7871,
+      "step": 412
+    },
+    {
+      "epoch": 0.22026666666666667,
+      "grad_norm": 0.4638134985671546,
+      "learning_rate": 0.0001816666487775416,
+      "loss": 0.7739,
+      "step": 413
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.41864721891489537,
+      "learning_rate": 0.00018156679948605467,
+      "loss": 0.763,
+      "step": 414
+    },
+    {
+      "epoch": 0.22133333333333333,
+      "grad_norm": 0.3889516114988014,
+      "learning_rate": 0.00018146670662372354,
+      "loss": 0.7374,
+      "step": 415
+    },
+    {
+      "epoch": 0.22186666666666666,
+      "grad_norm": 0.42782093903969304,
+      "learning_rate": 0.0001813663704894407,
+      "loss": 0.7829,
+      "step": 416
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.4301643227706815,
+      "learning_rate": 0.00018126579138282503,
+      "loss": 0.743,
+      "step": 417
+    },
+    {
+      "epoch": 0.22293333333333334,
+      "grad_norm": 0.3965266913429562,
+      "learning_rate": 0.00018116496960422107,
+      "loss": 0.7429,
+      "step": 418
+    },
+    {
+      "epoch": 0.22346666666666667,
+      "grad_norm": 0.43728064114296455,
+      "learning_rate": 0.00018106390545469795,
+      "loss": 0.7701,
+      "step": 419
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3620570638238654,
+      "learning_rate": 0.0001809625992360485,
+      "loss": 0.6786,
+      "step": 420
+    },
+    {
+      "epoch": 0.22453333333333333,
+      "grad_norm": 0.45283954918424174,
+      "learning_rate": 0.00018086105125078857,
+      "loss": 0.8196,
+      "step": 421
+    },
+    {
+      "epoch": 0.22506666666666666,
+      "grad_norm": 0.4343397005833213,
+      "learning_rate": 0.00018075926180215576,
+      "loss": 0.7969,
+      "step": 422
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.4079287424387631,
+      "learning_rate": 0.00018065723119410884,
+      "loss": 0.7261,
+      "step": 423
+    },
+    {
+      "epoch": 0.22613333333333333,
+      "grad_norm": 0.4222242988822073,
+      "learning_rate": 0.0001805549597313267,
+      "loss": 0.8046,
+      "step": 424
+    },
+    {
+      "epoch": 0.22666666666666666,
+      "grad_norm": 0.4293398231847534,
+      "learning_rate": 0.0001804524477192075,
+      "loss": 0.7713,
+      "step": 425
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4798233731304898,
+      "learning_rate": 0.00018034969546386757,
+      "loss": 0.8631,
+      "step": 426
+    },
+    {
+      "epoch": 0.22773333333333334,
+      "grad_norm": 0.37203957607612326,
+      "learning_rate": 0.00018024670327214084,
+      "loss": 0.6513,
+      "step": 427
+    },
+    {
+      "epoch": 0.22826666666666667,
+      "grad_norm": 0.38700273162228754,
+      "learning_rate": 0.00018014347145157755,
+      "loss": 0.7383,
+      "step": 428
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.39274314684924067,
+      "learning_rate": 0.0001800400003104436,
+      "loss": 0.7127,
+      "step": 429
+    },
+    {
+      "epoch": 0.22933333333333333,
+      "grad_norm": 0.4220019961457572,
+      "learning_rate": 0.0001799362901577196,
+      "loss": 0.7666,
+      "step": 430
+    },
+    {
+      "epoch": 0.22986666666666666,
+      "grad_norm": 0.40725013616980993,
+      "learning_rate": 0.00017983234130309968,
+      "loss": 0.7364,
+      "step": 431
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4210188085980111,
+      "learning_rate": 0.00017972815405699103,
+      "loss": 0.7196,
+      "step": 432
+    },
+    {
+      "epoch": 0.23093333333333332,
+      "grad_norm": 0.38157409826429195,
+      "learning_rate": 0.00017962372873051252,
+      "loss": 0.7486,
+      "step": 433
+    },
+    {
+      "epoch": 0.23146666666666665,
+      "grad_norm": 0.4619994851529084,
+      "learning_rate": 0.00017951906563549397,
+      "loss": 0.8306,
+      "step": 434
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.3870858054183792,
+      "learning_rate": 0.00017941416508447536,
+      "loss": 0.7092,
+      "step": 435
+    },
+    {
+      "epoch": 0.23253333333333334,
+      "grad_norm": 0.44663797567474073,
+      "learning_rate": 0.00017930902739070562,
+      "loss": 0.8126,
+      "step": 436
+    },
+    {
+      "epoch": 0.23306666666666667,
+      "grad_norm": 0.43388290831328646,
+      "learning_rate": 0.00017920365286814183,
+      "loss": 0.7088,
+      "step": 437
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4699874469397872,
+      "learning_rate": 0.0001790980418314484,
+      "loss": 0.7455,
+      "step": 438
+    },
+    {
+      "epoch": 0.23413333333333333,
+      "grad_norm": 0.4404749844802004,
+      "learning_rate": 0.0001789921945959958,
+      "loss": 0.7763,
+      "step": 439
+    },
+    {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.42788428607178386,
+      "learning_rate": 0.00017888611147786002,
+      "loss": 0.7964,
+      "step": 440
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.3989626138037562,
+      "learning_rate": 0.00017877979279382135,
+      "loss": 0.7574,
+      "step": 441
+    },
+    {
+      "epoch": 0.23573333333333332,
+      "grad_norm": 0.39554981991833266,
+      "learning_rate": 0.00017867323886136348,
+      "loss": 0.7237,
+      "step": 442
+    },
+    {
+      "epoch": 0.23626666666666668,
+      "grad_norm": 0.38639141646450237,
+      "learning_rate": 0.00017856644999867264,
+      "loss": 0.6986,
+      "step": 443
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.44363977875016986,
+      "learning_rate": 0.0001784594265246366,
+      "loss": 0.8224,
+      "step": 444
+    },
+    {
+      "epoch": 0.23733333333333334,
+      "grad_norm": 0.39983749874740654,
+      "learning_rate": 0.00017835216875884368,
+      "loss": 0.7154,
+      "step": 445
+    },
+    {
+      "epoch": 0.23786666666666667,
+      "grad_norm": 0.39074455401397934,
+      "learning_rate": 0.0001782446770215819,
+      "loss": 0.6708,
+      "step": 446
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.4382862374773125,
+      "learning_rate": 0.0001781369516338378,
+      "loss": 0.7888,
+      "step": 447
+    },
+    {
+      "epoch": 0.23893333333333333,
+      "grad_norm": 0.45623092899339585,
+      "learning_rate": 0.00017802899291729585,
+      "loss": 0.7406,
+      "step": 448
+    },
+    {
+      "epoch": 0.23946666666666666,
+      "grad_norm": 0.35555959303217444,
+      "learning_rate": 0.0001779208011943371,
+      "loss": 0.6977,
+      "step": 449
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3764169462428527,
+      "learning_rate": 0.00017781237678803847,
+      "loss": 0.7668,
+      "step": 450
+    },
+    {
+      "epoch": 0.24053333333333332,
+      "grad_norm": 0.44188810666729844,
+      "learning_rate": 0.00017770372002217172,
+      "loss": 0.7776,
+      "step": 451
+    },
+    {
+      "epoch": 0.24106666666666668,
+      "grad_norm": 0.4061500921489558,
+      "learning_rate": 0.00017759483122120238,
+      "loss": 0.7578,
+      "step": 452
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.38125870005018797,
+      "learning_rate": 0.000177485710710289,
+      "loss": 0.6708,
+      "step": 453
+    },
+    {
+      "epoch": 0.24213333333333334,
+      "grad_norm": 0.3786000392085864,
+      "learning_rate": 0.00017737635881528196,
+      "loss": 0.743,
+      "step": 454
+    },
+    {
+      "epoch": 0.24266666666666667,
+      "grad_norm": 0.44304176475420287,
+      "learning_rate": 0.00017726677586272263,
+      "loss": 0.7115,
+      "step": 455
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.44540392013839997,
+      "learning_rate": 0.00017715696217984235,
+      "loss": 0.855,
+      "step": 456
+    },
+    {
+      "epoch": 0.24373333333333333,
+      "grad_norm": 0.35413155866160545,
+      "learning_rate": 0.00017704691809456143,
+      "loss": 0.6517,
+      "step": 457
+    },
+    {
+      "epoch": 0.24426666666666666,
+      "grad_norm": 0.4198272028109067,
+      "learning_rate": 0.0001769366439354882,
+      "loss": 0.7056,
+      "step": 458
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.45676985184117036,
+      "learning_rate": 0.00017682614003191807,
+      "loss": 0.7924,
+      "step": 459
+    },
+    {
+      "epoch": 0.24533333333333332,
+      "grad_norm": 0.38917203783210363,
+      "learning_rate": 0.00017671540671383243,
+      "loss": 0.6827,
+      "step": 460
+    },
+    {
+      "epoch": 0.24586666666666668,
+      "grad_norm": 0.4621210509235434,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.8037,
+      "step": 461
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.39189614997837624,
+      "learning_rate": 0.00017649325315746478,
+      "loss": 0.6711,
+      "step": 462
+    },
+    {
+      "epoch": 0.24693333333333334,
+      "grad_norm": 0.42029604735547044,
+      "learning_rate": 0.00017638183358256696,
+      "loss": 0.7949,
+      "step": 463
+    },
+    {
+      "epoch": 0.24746666666666667,
+      "grad_norm": 0.38936531570922495,
+      "learning_rate": 0.00017627018591992018,
+      "loss": 0.7519,
+      "step": 464
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.378696399369877,
+      "learning_rate": 0.0001761583105029213,
+      "loss": 0.7308,
+      "step": 465
+    },
+    {
+      "epoch": 0.24853333333333333,
+      "grad_norm": 0.39504712205717357,
+      "learning_rate": 0.00017604620766564723,
+      "loss": 0.7411,
+      "step": 466
+    },
+    {
+      "epoch": 0.24906666666666666,
+      "grad_norm": 0.3718348859138365,
+      "learning_rate": 0.00017593387774285412,
+      "loss": 0.7069,
+      "step": 467
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3962793562795794,
+      "learning_rate": 0.00017582132106997616,
+      "loss": 0.7673,
+      "step": 468
+    },
+    {
+      "epoch": 0.2501333333333333,
+      "grad_norm": 0.37748344421953184,
+      "learning_rate": 0.0001757085379831246,
+      "loss": 0.7166,
+      "step": 469
+    },
+    {
+      "epoch": 0.25066666666666665,
+      "grad_norm": 0.44195969497521115,
+      "learning_rate": 0.00017559552881908695,
+      "loss": 0.7465,
+      "step": 470
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.46254545669392455,
+      "learning_rate": 0.00017548229391532572,
+      "loss": 0.7834,
+      "step": 471
+    },
+    {
+      "epoch": 0.2517333333333333,
+      "grad_norm": 0.40132248932853,
+      "learning_rate": 0.00017536883360997743,
+      "loss": 0.6922,
+      "step": 472
+    },
+    {
+      "epoch": 0.25226666666666664,
+      "grad_norm": 0.45874268019124786,
+      "learning_rate": 0.00017525514824185185,
+      "loss": 0.7523,
+      "step": 473
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.469745709659557,
+      "learning_rate": 0.00017514123815043074,
+      "loss": 0.756,
+      "step": 474
+    },
+    {
+      "epoch": 0.25333333333333335,
+      "grad_norm": 0.3987742896758139,
+      "learning_rate": 0.00017502710367586687,
+      "loss": 0.7892,
+      "step": 475
+    },
+    {
+      "epoch": 0.2538666666666667,
+      "grad_norm": 0.41833025224807635,
+      "learning_rate": 0.0001749127451589832,
+      "loss": 0.7323,
+      "step": 476
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.45771691916950347,
+      "learning_rate": 0.00017479816294127152,
+      "loss": 0.8732,
+      "step": 477
+    },
+    {
+      "epoch": 0.25493333333333335,
+      "grad_norm": 0.4326273075120264,
+      "learning_rate": 0.00017468335736489177,
+      "loss": 0.7967,
+      "step": 478
+    },
+    {
+      "epoch": 0.2554666666666667,
+      "grad_norm": 0.42879572658964055,
+      "learning_rate": 0.00017456832877267084,
+      "loss": 0.7728,
+      "step": 479
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.42443080229742036,
+      "learning_rate": 0.0001744530775081015,
+      "loss": 0.6936,
+      "step": 480
+    },
+    {
+      "epoch": 0.25653333333333334,
+      "grad_norm": 0.3775595399172083,
+      "learning_rate": 0.00017433760391534167,
+      "loss": 0.7445,
+      "step": 481
+    },
+    {
+      "epoch": 0.25706666666666667,
+      "grad_norm": 0.37562992179622606,
+      "learning_rate": 0.00017422190833921283,
+      "loss": 0.6713,
+      "step": 482
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.4183413380348096,
+      "learning_rate": 0.0001741059911251997,
+      "loss": 0.7204,
+      "step": 483
+    },
+    {
+      "epoch": 0.2581333333333333,
+      "grad_norm": 0.4047559155366429,
+      "learning_rate": 0.00017398985261944856,
+      "loss": 0.6899,
+      "step": 484
+    },
+    {
+      "epoch": 0.25866666666666666,
+      "grad_norm": 0.40330462953906643,
+      "learning_rate": 0.00017387349316876666,
+      "loss": 0.7682,
+      "step": 485
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.40961753152529895,
+      "learning_rate": 0.000173756913120621,
+      "loss": 0.731,
+      "step": 486
+    },
+    {
+      "epoch": 0.2597333333333333,
+      "grad_norm": 0.46143460044277884,
+      "learning_rate": 0.0001736401128231373,
+      "loss": 0.8323,
+      "step": 487
+    },
+    {
+      "epoch": 0.26026666666666665,
+      "grad_norm": 0.3902872125584795,
+      "learning_rate": 0.00017352309262509894,
+      "loss": 0.7389,
+      "step": 488
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.41083855940817166,
+      "learning_rate": 0.00017340585287594604,
+      "loss": 0.7407,
+      "step": 489
+    },
+    {
+      "epoch": 0.2613333333333333,
+      "grad_norm": 0.4375489330025397,
+      "learning_rate": 0.0001732883939257742,
+      "loss": 0.7795,
+      "step": 490
+    },
+    {
+      "epoch": 0.2618666666666667,
+      "grad_norm": 0.38274951421925013,
+      "learning_rate": 0.0001731707161253338,
+      "loss": 0.7433,
+      "step": 491
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.4422124748989928,
+      "learning_rate": 0.0001730528198260285,
+      "loss": 0.7672,
+      "step": 492
+    },
+    {
+      "epoch": 0.26293333333333335,
+      "grad_norm": 0.3959184333586809,
+      "learning_rate": 0.00017293470537991463,
+      "loss": 0.6795,
+      "step": 493
+    },
+    {
+      "epoch": 0.2634666666666667,
+      "grad_norm": 0.6970076830225627,
+      "learning_rate": 0.00017281637313969978,
+      "loss": 0.8282,
+      "step": 494
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.43077764137115643,
+      "learning_rate": 0.00017269782345874203,
+      "loss": 0.7355,
+      "step": 495
+    },
+    {
+      "epoch": 0.26453333333333334,
+      "grad_norm": 0.37717081863037677,
+      "learning_rate": 0.00017257905669104874,
+      "loss": 0.703,
+      "step": 496
+    },
+    {
+      "epoch": 0.2650666666666667,
+      "grad_norm": 0.47406791963685385,
+      "learning_rate": 0.00017246007319127545,
+      "loss": 0.7975,
+      "step": 497
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4078671203332442,
+      "learning_rate": 0.00017234087331472497,
+      "loss": 0.7517,
+      "step": 498
+    },
+    {
+      "epoch": 0.26613333333333333,
+      "grad_norm": 0.467097270780955,
+      "learning_rate": 0.00017222145741734626,
+      "loss": 0.7894,
+      "step": 499
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.3787523725179631,
+      "learning_rate": 0.00017210182585573327,
+      "loss": 0.6397,
+      "step": 500
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.43220939878711917,
+      "learning_rate": 0.00017198197898712404,
+      "loss": 0.7411,
+      "step": 501
+    },
+    {
+      "epoch": 0.2677333333333333,
+      "grad_norm": 0.41565474656753976,
+      "learning_rate": 0.00017186191716939944,
+      "loss": 0.7615,
+      "step": 502
+    },
+    {
+      "epoch": 0.26826666666666665,
+      "grad_norm": 0.49033042942470395,
+      "learning_rate": 0.0001717416407610824,
+      "loss": 0.8386,
+      "step": 503
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4011579642645517,
+      "learning_rate": 0.00017162115012133643,
+      "loss": 0.722,
+      "step": 504
+    },
+    {
+      "epoch": 0.2693333333333333,
+      "grad_norm": 0.45892566198535245,
+      "learning_rate": 0.00017150044560996488,
+      "loss": 0.8441,
+      "step": 505
+    },
+    {
+      "epoch": 0.26986666666666664,
+      "grad_norm": 0.41270635318007476,
+      "learning_rate": 0.00017137952758740978,
+      "loss": 0.7665,
+      "step": 506
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.4670966351858991,
+      "learning_rate": 0.00017125839641475072,
+      "loss": 0.7195,
+      "step": 507
+    },
+    {
+      "epoch": 0.27093333333333336,
+      "grad_norm": 0.4369010227224225,
+      "learning_rate": 0.00017113705245370368,
+      "loss": 0.7719,
+      "step": 508
+    },
+    {
+      "epoch": 0.2714666666666667,
+      "grad_norm": 0.36176267074976143,
+      "learning_rate": 0.00017101549606662024,
+      "loss": 0.6905,
+      "step": 509
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.451818585521191,
+      "learning_rate": 0.00017089372761648616,
+      "loss": 0.7777,
+      "step": 510
+    },
+    {
+      "epoch": 0.27253333333333335,
+      "grad_norm": 0.40613248875745106,
+      "learning_rate": 0.00017077174746692056,
+      "loss": 0.7077,
+      "step": 511
+    },
+    {
+      "epoch": 0.2730666666666667,
+      "grad_norm": 0.3843397450463577,
+      "learning_rate": 0.00017064955598217462,
+      "loss": 0.6993,
+      "step": 512
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.4272003459053182,
+      "learning_rate": 0.00017052715352713075,
+      "loss": 0.7265,
+      "step": 513
+    },
+    {
+      "epoch": 0.27413333333333334,
+      "grad_norm": 0.42543241976806057,
+      "learning_rate": 0.00017040454046730115,
+      "loss": 0.7444,
+      "step": 514
+    },
+    {
+      "epoch": 0.27466666666666667,
+      "grad_norm": 0.4720208024279849,
+      "learning_rate": 0.00017028171716882714,
+      "loss": 0.8301,
+      "step": 515
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.42181444045209004,
+      "learning_rate": 0.00017015868399847768,
+      "loss": 0.7391,
+      "step": 516
+    },
+    {
+      "epoch": 0.27573333333333333,
+      "grad_norm": 0.3736111879553547,
+      "learning_rate": 0.00017003544132364846,
+      "loss": 0.7053,
+      "step": 517
+    },
+    {
+      "epoch": 0.27626666666666666,
+      "grad_norm": 0.42019625065875815,
+      "learning_rate": 0.00016991198951236088,
+      "loss": 0.7105,
+      "step": 518
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.3999899407680853,
+      "learning_rate": 0.00016978832893326074,
+      "loss": 0.7934,
+      "step": 519
+    },
+    {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.4183766494907831,
+      "learning_rate": 0.00016966445995561727,
+      "loss": 0.78,
+      "step": 520
+    },
+    {
+      "epoch": 0.27786666666666665,
+      "grad_norm": 0.4011487543794493,
+      "learning_rate": 0.00016954038294932216,
+      "loss": 0.7253,
+      "step": 521
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.41258677704545643,
+      "learning_rate": 0.00016941609828488807,
+      "loss": 0.764,
+      "step": 522
+    },
+    {
+      "epoch": 0.2789333333333333,
+      "grad_norm": 0.3844441162274048,
+      "learning_rate": 0.0001692916063334479,
+      "loss": 0.7392,
+      "step": 523
+    },
+    {
+      "epoch": 0.27946666666666664,
+      "grad_norm": 0.3906556434475999,
+      "learning_rate": 0.0001691669074667535,
+      "loss": 0.7455,
+      "step": 524
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.3898053859460415,
+      "learning_rate": 0.0001690420020571747,
+      "loss": 0.7486,
+      "step": 525
+    },
+    {
+      "epoch": 0.28053333333333336,
+      "grad_norm": 0.40164869224700533,
+      "learning_rate": 0.0001689168904776979,
+      "loss": 0.7393,
+      "step": 526
+    },
+    {
+      "epoch": 0.2810666666666667,
+      "grad_norm": 0.3711709136714381,
+      "learning_rate": 0.00016879157310192535,
+      "loss": 0.6806,
+      "step": 527
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.37875601700616585,
+      "learning_rate": 0.0001686660503040737,
+      "loss": 0.6296,
+      "step": 528
+    },
+    {
+      "epoch": 0.28213333333333335,
+      "grad_norm": 0.4475727202564068,
+      "learning_rate": 0.00016854032245897308,
+      "loss": 0.7294,
+      "step": 529
+    },
+    {
+      "epoch": 0.2826666666666667,
+      "grad_norm": 0.41107973700270495,
+      "learning_rate": 0.00016841438994206595,
+      "loss": 0.7163,
+      "step": 530
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.4278871213826476,
+      "learning_rate": 0.00016828825312940592,
+      "loss": 0.7175,
+      "step": 531
+    },
+    {
+      "epoch": 0.28373333333333334,
+      "grad_norm": 0.47003295264775447,
+      "learning_rate": 0.00016816191239765667,
+      "loss": 0.7789,
+      "step": 532
+    },
+    {
+      "epoch": 0.28426666666666667,
+      "grad_norm": 0.4569330956737104,
+      "learning_rate": 0.00016803536812409075,
+      "loss": 0.7781,
+      "step": 533
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4096933006186417,
+      "learning_rate": 0.0001679086206865886,
+      "loss": 0.7598,
+      "step": 534
+    },
+    {
+      "epoch": 0.2853333333333333,
+      "grad_norm": 0.4191183195577611,
+      "learning_rate": 0.00016778167046363734,
+      "loss": 0.7314,
+      "step": 535
+    },
+    {
+      "epoch": 0.28586666666666666,
+      "grad_norm": 0.3838781414701778,
+      "learning_rate": 0.00016765451783432953,
+      "loss": 0.7359,
+      "step": 536
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.43993470287325687,
+      "learning_rate": 0.00016752716317836229,
+      "loss": 0.8266,
+      "step": 537
+    },
+    {
+      "epoch": 0.2869333333333333,
+      "grad_norm": 0.4998185831352091,
+      "learning_rate": 0.0001673996068760359,
+      "loss": 0.7512,
+      "step": 538
+    },
+    {
+      "epoch": 0.28746666666666665,
+      "grad_norm": 0.4474662471696046,
+      "learning_rate": 0.00016727184930825288,
+      "loss": 0.7365,
+      "step": 539
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.41684893319551,
+      "learning_rate": 0.0001671438908565167,
+      "loss": 0.7478,
+      "step": 540
+    },
+    {
+      "epoch": 0.2885333333333333,
+      "grad_norm": 0.4081069161723271,
+      "learning_rate": 0.00016701573190293077,
+      "loss": 0.7478,
+      "step": 541
+    },
+    {
+      "epoch": 0.2890666666666667,
+      "grad_norm": 0.5424920296466795,
+      "learning_rate": 0.00016688737283019706,
+      "loss": 0.7227,
+      "step": 542
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.4148912934325243,
+      "learning_rate": 0.00016675881402161536,
+      "loss": 0.7182,
+      "step": 543
+    },
+    {
+      "epoch": 0.29013333333333335,
+      "grad_norm": 0.4231329818416088,
+      "learning_rate": 0.00016663005586108176,
+      "loss": 0.7365,
+      "step": 544
+    },
+    {
+      "epoch": 0.2906666666666667,
+      "grad_norm": 0.4149981816868053,
+      "learning_rate": 0.00016650109873308765,
+      "loss": 0.7979,
+      "step": 545
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.49680839154614576,
+      "learning_rate": 0.0001663719430227186,
+      "loss": 0.8492,
+      "step": 546
+    },
+    {
+      "epoch": 0.29173333333333334,
+      "grad_norm": 0.444439051760402,
+      "learning_rate": 0.0001662425891156531,
+      "loss": 0.8233,
+      "step": 547
+    },
+    {
+      "epoch": 0.2922666666666667,
+      "grad_norm": 0.4121540003234031,
+      "learning_rate": 0.00016611303739816168,
+      "loss": 0.7628,
+      "step": 548
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.4291469833584802,
+      "learning_rate": 0.00016598328825710533,
+      "loss": 0.7534,
+      "step": 549
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.3798516076774825,
+      "learning_rate": 0.00016585334207993476,
+      "loss": 0.7333,
+      "step": 550
+    },
+    {
+      "epoch": 0.29386666666666666,
+      "grad_norm": 0.4065165333597523,
+      "learning_rate": 0.00016572319925468892,
+      "loss": 0.7981,
+      "step": 551
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4692693747077233,
+      "learning_rate": 0.000165592860169994,
+      "loss": 0.8722,
+      "step": 552
+    },
+    {
+      "epoch": 0.2949333333333333,
+      "grad_norm": 0.3895284877001692,
+      "learning_rate": 0.0001654623252150624,
+      "loss": 0.6476,
+      "step": 553
+    },
+    {
+      "epoch": 0.29546666666666666,
+      "grad_norm": 0.355722365184838,
+      "learning_rate": 0.00016533159477969122,
+      "loss": 0.6832,
+      "step": 554
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.4430359057897852,
+      "learning_rate": 0.00016520066925426144,
+      "loss": 0.8036,
+      "step": 555
+    },
+    {
+      "epoch": 0.2965333333333333,
+      "grad_norm": 0.4217197639087501,
+      "learning_rate": 0.00016506954902973655,
+      "loss": 0.757,
+      "step": 556
+    },
+    {
+      "epoch": 0.29706666666666665,
+      "grad_norm": 0.3724598561245106,
+      "learning_rate": 0.00016493823449766136,
+      "loss": 0.6265,
+      "step": 557
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.39196229611640815,
+      "learning_rate": 0.0001648067260501611,
+      "loss": 0.6382,
+      "step": 558
+    },
+    {
+      "epoch": 0.2981333333333333,
+      "grad_norm": 0.351739632446928,
+      "learning_rate": 0.00016467502407993992,
+      "loss": 0.7245,
+      "step": 559
+    },
+    {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.3646881424861877,
+      "learning_rate": 0.0001645431289802799,
+      "loss": 0.6625,
+      "step": 560
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.4580483452429409,
+      "learning_rate": 0.0001644110411450398,
+      "loss": 0.8263,
+      "step": 561
+    },
+    {
+      "epoch": 0.29973333333333335,
+      "grad_norm": 0.4223892248868995,
+      "learning_rate": 0.00016427876096865394,
+      "loss": 0.7199,
+      "step": 562
+    },
+    {
+      "epoch": 0.3002666666666667,
+      "grad_norm": 0.4459595943654794,
+      "learning_rate": 0.00016414628884613107,
+      "loss": 0.7978,
+      "step": 563
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4389200618647841,
+      "learning_rate": 0.00016401362517305296,
+      "loss": 0.7366,
+      "step": 564
+    },
+    {
+      "epoch": 0.30133333333333334,
+      "grad_norm": 0.44107537261661384,
+      "learning_rate": 0.00016388077034557355,
+      "loss": 0.7633,
+      "step": 565
+    },
+    {
+      "epoch": 0.30186666666666667,
+      "grad_norm": 0.37709812051391756,
+      "learning_rate": 0.00016374772476041748,
+      "loss": 0.7322,
+      "step": 566
+    },
+    {
+      "epoch": 0.3024,
+      "grad_norm": 0.40402394223372934,
+      "learning_rate": 0.00016361448881487914,
+      "loss": 0.766,
+      "step": 567
+    },
+    {
+      "epoch": 0.30293333333333333,
+      "grad_norm": 0.48578289266442454,
+      "learning_rate": 0.00016348106290682118,
+      "loss": 0.8789,
+      "step": 568
+    },
+    {
+      "epoch": 0.30346666666666666,
+      "grad_norm": 0.39376794347238314,
+      "learning_rate": 0.00016334744743467364,
+      "loss": 0.7705,
+      "step": 569
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3721003314014055,
+      "learning_rate": 0.00016321364279743266,
+      "loss": 0.635,
+      "step": 570
+    },
+    {
+      "epoch": 0.3045333333333333,
+      "grad_norm": 0.49231908734668456,
+      "learning_rate": 0.00016307964939465914,
+      "loss": 0.8152,
+      "step": 571
+    },
+    {
+      "epoch": 0.30506666666666665,
+      "grad_norm": 0.4112118850245869,
+      "learning_rate": 0.00016294546762647775,
+      "loss": 0.7467,
+      "step": 572
+    },
+    {
+      "epoch": 0.3056,
+      "grad_norm": 0.4852694643675637,
+      "learning_rate": 0.0001628110978935756,
+      "loss": 0.85,
+      "step": 573
+    },
+    {
+      "epoch": 0.3061333333333333,
+      "grad_norm": 0.4276293897464289,
+      "learning_rate": 0.0001626765405972011,
+      "loss": 0.7662,
+      "step": 574
+    },
+    {
+      "epoch": 0.30666666666666664,
+      "grad_norm": 0.3908743732434518,
+      "learning_rate": 0.00016254179613916278,
+      "loss": 0.7291,
+      "step": 575
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.42742623034741456,
+      "learning_rate": 0.00016240686492182804,
+      "loss": 0.7115,
+      "step": 576
+    },
+    {
+      "epoch": 0.30773333333333336,
+      "grad_norm": 0.3823576940969926,
+      "learning_rate": 0.000162271747348122,
+      "loss": 0.7108,
+      "step": 577
+    },
+    {
+      "epoch": 0.3082666666666667,
+      "grad_norm": 0.4139648570198011,
+      "learning_rate": 0.0001621364438215262,
+      "loss": 0.788,
+      "step": 578
+    },
+    {
+      "epoch": 0.3088,
+      "grad_norm": 0.4430555218440438,
+      "learning_rate": 0.00016200095474607753,
+      "loss": 0.7621,
+      "step": 579
+    },
+    {
+      "epoch": 0.30933333333333335,
+      "grad_norm": 0.4083726535659133,
+      "learning_rate": 0.00016186528052636692,
+      "loss": 0.6616,
+      "step": 580
+    },
+    {
+      "epoch": 0.3098666666666667,
+      "grad_norm": 0.39823684344449684,
+      "learning_rate": 0.0001617294215675382,
+      "loss": 0.7481,
+      "step": 581
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.43717230620111364,
+      "learning_rate": 0.00016159337827528685,
+      "loss": 0.6599,
+      "step": 582
+    },
+    {
+      "epoch": 0.31093333333333334,
+      "grad_norm": 0.4055751859684295,
+      "learning_rate": 0.0001614571510558588,
+      "loss": 0.7394,
+      "step": 583
+    },
+    {
+      "epoch": 0.31146666666666667,
+      "grad_norm": 0.37267406752774485,
+      "learning_rate": 0.00016132074031604917,
+      "loss": 0.6659,
+      "step": 584
+    },
+    {
+      "epoch": 0.312,
+      "grad_norm": 0.47133028091157125,
+      "learning_rate": 0.0001611841464632011,
+      "loss": 0.7821,
+      "step": 585
+    },
+    {
+      "epoch": 0.31253333333333333,
+      "grad_norm": 0.42685033370576386,
+      "learning_rate": 0.00016104736990520468,
+      "loss": 0.7143,
+      "step": 586
+    },
+    {
+      "epoch": 0.31306666666666666,
+      "grad_norm": 0.35100355558466984,
+      "learning_rate": 0.0001609104110504954,
+      "loss": 0.6531,
+      "step": 587
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.4134879399525361,
+      "learning_rate": 0.0001607732703080532,
+      "loss": 0.781,
+      "step": 588
+    },
+    {
+      "epoch": 0.3141333333333333,
+      "grad_norm": 0.4253772794714469,
+      "learning_rate": 0.00016063594808740113,
+      "loss": 0.7855,
+      "step": 589
+    },
+    {
+      "epoch": 0.31466666666666665,
+      "grad_norm": 0.45883456420996127,
+      "learning_rate": 0.00016049844479860422,
+      "loss": 0.7806,
+      "step": 590
+    },
+    {
+      "epoch": 0.3152,
+      "grad_norm": 0.41643029403734655,
+      "learning_rate": 0.00016036076085226814,
+      "loss": 0.738,
+      "step": 591
+    },
+    {
+      "epoch": 0.3157333333333333,
+      "grad_norm": 0.42932450527570937,
+      "learning_rate": 0.00016022289665953808,
+      "loss": 0.7621,
+      "step": 592
+    },
+    {
+      "epoch": 0.31626666666666664,
+      "grad_norm": 0.40734119656325934,
+      "learning_rate": 0.00016008485263209742,
+      "loss": 0.7203,
+      "step": 593
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3705935827101678,
+      "learning_rate": 0.0001599466291821666,
+      "loss": 0.7055,
+      "step": 594
+    },
+    {
+      "epoch": 0.31733333333333336,
+      "grad_norm": 0.3991816141148357,
+      "learning_rate": 0.0001598082267225018,
+      "loss": 0.7175,
+      "step": 595
+    },
+    {
+      "epoch": 0.3178666666666667,
+      "grad_norm": 0.4104785870045507,
+      "learning_rate": 0.0001596696456663938,
+      "loss": 0.6824,
+      "step": 596
+    },
+    {
+      "epoch": 0.3184,
+      "grad_norm": 0.3705219630510094,
+      "learning_rate": 0.0001595308864276666,
+      "loss": 0.6752,
+      "step": 597
+    },
+    {
+      "epoch": 0.31893333333333335,
+      "grad_norm": 0.41335389806428185,
+      "learning_rate": 0.00015939194942067646,
+      "loss": 0.7045,
+      "step": 598
+    },
+    {
+      "epoch": 0.3194666666666667,
+      "grad_norm": 0.41064141240153385,
+      "learning_rate": 0.0001592528350603103,
+      "loss": 0.6749,
+      "step": 599
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.38639297771168474,
+      "learning_rate": 0.0001591135437619847,
+      "loss": 0.7281,
+      "step": 600
+    },
+    {
+      "epoch": 0.32053333333333334,
+      "grad_norm": 0.4192855338902871,
+      "learning_rate": 0.00015897407594164467,
+      "loss": 0.7633,
+      "step": 601
+    },
+    {
+      "epoch": 0.32106666666666667,
+      "grad_norm": 0.4073470734439687,
+      "learning_rate": 0.00015883443201576225,
+      "loss": 0.7452,
+      "step": 602
+    },
+    {
+      "epoch": 0.3216,
+      "grad_norm": 0.4381908278901834,
+      "learning_rate": 0.0001586946124013354,
+      "loss": 0.7854,
+      "step": 603
+    },
+    {
+      "epoch": 0.3221333333333333,
+      "grad_norm": 0.4733147031707669,
+      "learning_rate": 0.00015855461751588677,
+      "loss": 0.792,
+      "step": 604
+    },
+    {
+      "epoch": 0.32266666666666666,
+      "grad_norm": 0.4262777998983014,
+      "learning_rate": 0.0001584144477774623,
+      "loss": 0.7347,
+      "step": 605
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.4183765799440559,
+      "learning_rate": 0.0001582741036046301,
+      "loss": 0.7265,
+      "step": 606
+    },
+    {
+      "epoch": 0.3237333333333333,
+      "grad_norm": 0.4435370888035211,
+      "learning_rate": 0.00015813358541647915,
+      "loss": 0.7408,
+      "step": 607
+    },
+    {
+      "epoch": 0.32426666666666665,
+      "grad_norm": 0.40791137350226997,
+      "learning_rate": 0.00015799289363261813,
+      "loss": 0.7421,
+      "step": 608
+    },
+    {
+      "epoch": 0.3248,
+      "grad_norm": 0.40533755962194024,
+      "learning_rate": 0.00015785202867317407,
+      "loss": 0.6517,
+      "step": 609
+    },
+    {
+      "epoch": 0.3253333333333333,
+      "grad_norm": 0.41043330111795495,
+      "learning_rate": 0.00015771099095879108,
+      "loss": 0.6577,
+      "step": 610
+    },
+    {
+      "epoch": 0.3258666666666667,
+      "grad_norm": 0.40590348596194553,
+      "learning_rate": 0.0001575697809106292,
+      "loss": 0.7266,
+      "step": 611
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4566038347829871,
+      "learning_rate": 0.00015742839895036305,
+      "loss": 0.725,
+      "step": 612
+    },
+    {
+      "epoch": 0.32693333333333335,
+      "grad_norm": 0.46166239710286383,
+      "learning_rate": 0.00015728684550018064,
+      "loss": 0.8159,
+      "step": 613
+    },
+    {
+      "epoch": 0.3274666666666667,
+      "grad_norm": 0.3927204087715734,
+      "learning_rate": 0.0001571451209827821,
+      "loss": 0.6838,
+      "step": 614
+    },
+    {
+      "epoch": 0.328,
+      "grad_norm": 0.38145862694035654,
+      "learning_rate": 0.00015700322582137827,
+      "loss": 0.7287,
+      "step": 615
+    },
+    {
+      "epoch": 0.32853333333333334,
+      "grad_norm": 0.41014410375622384,
+      "learning_rate": 0.00015686116043968972,
+      "loss": 0.7326,
+      "step": 616
+    },
+    {
+      "epoch": 0.3290666666666667,
+      "grad_norm": 0.4128926143802422,
+      "learning_rate": 0.00015671892526194516,
+      "loss": 0.7686,
+      "step": 617
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4496946594750262,
+      "learning_rate": 0.0001565765207128805,
+      "loss": 0.8154,
+      "step": 618
+    },
+    {
+      "epoch": 0.33013333333333333,
+      "grad_norm": 0.4081721073142836,
+      "learning_rate": 0.0001564339472177373,
+      "loss": 0.6923,
+      "step": 619
+    },
+    {
+      "epoch": 0.33066666666666666,
+      "grad_norm": 0.39323053922880735,
+      "learning_rate": 0.00015629120520226165,
+      "loss": 0.7472,
+      "step": 620
+    },
+    {
+      "epoch": 0.3312,
+      "grad_norm": 0.3889464467389498,
+      "learning_rate": 0.0001561482950927029,
+      "loss": 0.6888,
+      "step": 621
+    },
+    {
+      "epoch": 0.3317333333333333,
+      "grad_norm": 0.3782721459841835,
+      "learning_rate": 0.0001560052173158123,
+      "loss": 0.7008,
+      "step": 622
+    },
+    {
+      "epoch": 0.33226666666666665,
+      "grad_norm": 0.45711474908188415,
+      "learning_rate": 0.00015586197229884184,
+      "loss": 0.7355,
+      "step": 623
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.39437508744974215,
+      "learning_rate": 0.00015571856046954285,
+      "loss": 0.6854,
+      "step": 624
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.44044997727986784,
+      "learning_rate": 0.00015557498225616487,
+      "loss": 0.7947,
+      "step": 625
+    },
+    {
+      "epoch": 0.33386666666666664,
+      "grad_norm": 0.48326948955041055,
+      "learning_rate": 0.0001554312380874542,
+      "loss": 0.83,
+      "step": 626
+    },
+    {
+      "epoch": 0.3344,
+      "grad_norm": 0.4688313184034696,
+      "learning_rate": 0.00015528732839265272,
+      "loss": 0.791,
+      "step": 627
+    },
+    {
+      "epoch": 0.33493333333333336,
+      "grad_norm": 0.42255262210533706,
+      "learning_rate": 0.00015514325360149668,
+      "loss": 0.7324,
+      "step": 628
+    },
+    {
+      "epoch": 0.3354666666666667,
+      "grad_norm": 0.4347060818044885,
+      "learning_rate": 0.0001549990141442153,
+      "loss": 0.715,
+      "step": 629
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.42756407620451586,
+      "learning_rate": 0.0001548546104515294,
+      "loss": 0.7515,
+      "step": 630
+    },
+    {
+      "epoch": 0.33653333333333335,
+      "grad_norm": 0.4513617573308796,
+      "learning_rate": 0.00015471004295465035,
+      "loss": 0.8076,
+      "step": 631
+    },
+    {
+      "epoch": 0.3370666666666667,
+      "grad_norm": 0.43649044118937214,
+      "learning_rate": 0.0001545653120852787,
+      "loss": 0.7357,
+      "step": 632
+    },
+    {
+      "epoch": 0.3376,
+      "grad_norm": 0.46975188650514227,
+      "learning_rate": 0.00015442041827560274,
+      "loss": 0.8128,
+      "step": 633
+    },
+    {
+      "epoch": 0.33813333333333334,
+      "grad_norm": 0.4206316318669492,
+      "learning_rate": 0.00015427536195829742,
+      "loss": 0.7124,
+      "step": 634
+    },
+    {
+      "epoch": 0.33866666666666667,
+      "grad_norm": 0.43103204875720075,
+      "learning_rate": 0.00015413014356652286,
+      "loss": 0.741,
+      "step": 635
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4320759572006151,
+      "learning_rate": 0.00015398476353392323,
+      "loss": 0.7615,
+      "step": 636
+    },
+    {
+      "epoch": 0.33973333333333333,
+      "grad_norm": 0.44756733999939363,
+      "learning_rate": 0.00015383922229462549,
+      "loss": 0.7429,
+      "step": 637
+    },
+    {
+      "epoch": 0.34026666666666666,
+      "grad_norm": 0.46270821466141965,
+      "learning_rate": 0.00015369352028323774,
+      "loss": 0.7912,
+      "step": 638
+    },
+    {
+      "epoch": 0.3408,
+      "grad_norm": 0.40440151158694015,
+      "learning_rate": 0.00015354765793484834,
+      "loss": 0.7205,
+      "step": 639
+    },
+    {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.4066183589067994,
+      "learning_rate": 0.0001534016356850244,
+      "loss": 0.7136,
+      "step": 640
+    },
+    {
+      "epoch": 0.34186666666666665,
+      "grad_norm": 0.4428967511926324,
+      "learning_rate": 0.0001532554539698105,
+      "loss": 0.8338,
+      "step": 641
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.37756978595071494,
+      "learning_rate": 0.00015310911322572753,
+      "loss": 0.7153,
+      "step": 642
+    },
+    {
+      "epoch": 0.3429333333333333,
+      "grad_norm": 0.39618604131799795,
+      "learning_rate": 0.00015296261388977108,
+      "loss": 0.7169,
+      "step": 643
+    },
+    {
+      "epoch": 0.34346666666666664,
+      "grad_norm": 0.390452592211156,
+      "learning_rate": 0.0001528159563994104,
+      "loss": 0.6974,
+      "step": 644
+    },
+    {
+      "epoch": 0.344,
+      "grad_norm": 0.41959439719106323,
+      "learning_rate": 0.000152669141192587,
+      "loss": 0.7304,
+      "step": 645
+    },
+    {
+      "epoch": 0.34453333333333336,
+      "grad_norm": 0.38357579817872856,
+      "learning_rate": 0.00015252216870771345,
+      "loss": 0.7177,
+      "step": 646
+    },
+    {
+      "epoch": 0.3450666666666667,
+      "grad_norm": 0.37161116856644516,
+      "learning_rate": 0.00015237503938367186,
+      "loss": 0.6922,
+      "step": 647
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4051170055648018,
+      "learning_rate": 0.00015222775365981273,
+      "loss": 0.7687,
+      "step": 648
+    },
+    {
+      "epoch": 0.34613333333333335,
+      "grad_norm": 0.42783911951428605,
+      "learning_rate": 0.00015208031197595356,
+      "loss": 0.7296,
+      "step": 649
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.4580257926351465,
+      "learning_rate": 0.0001519327147723776,
+      "loss": 0.786,
+      "step": 650
+    },
+    {
+      "epoch": 0.3472,
+      "grad_norm": 0.39123493896464817,
+      "learning_rate": 0.00015178496248983254,
+      "loss": 0.7313,
+      "step": 651
+    },
+    {
+      "epoch": 0.34773333333333334,
+      "grad_norm": 0.41507210334543587,
+      "learning_rate": 0.0001516370555695291,
+      "loss": 0.7066,
+      "step": 652
+    },
+    {
+      "epoch": 0.34826666666666667,
+      "grad_norm": 0.481514406389178,
+      "learning_rate": 0.00015148899445313981,
+      "loss": 0.8217,
+      "step": 653
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.3943667256577028,
+      "learning_rate": 0.00015134077958279765,
+      "loss": 0.6981,
+      "step": 654
+    },
+    {
+      "epoch": 0.34933333333333333,
+      "grad_norm": 0.4004702594600421,
+      "learning_rate": 0.00015119241140109467,
+      "loss": 0.7401,
+      "step": 655
+    },
+    {
+      "epoch": 0.34986666666666666,
+      "grad_norm": 0.41454716058728625,
+      "learning_rate": 0.00015104389035108077,
+      "loss": 0.6944,
+      "step": 656
+    },
+    {
+      "epoch": 0.3504,
+      "grad_norm": 0.4008158675211356,
+      "learning_rate": 0.00015089521687626243,
+      "loss": 0.7121,
+      "step": 657
+    },
+    {
+      "epoch": 0.3509333333333333,
+      "grad_norm": 0.4086056163398199,
+      "learning_rate": 0.0001507463914206012,
+      "loss": 0.7153,
+      "step": 658
+    },
+    {
+      "epoch": 0.35146666666666665,
+      "grad_norm": 0.42739344037098376,
+      "learning_rate": 0.0001505974144285124,
+      "loss": 0.7885,
+      "step": 659
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4046439577379952,
+      "learning_rate": 0.000150448286344864,
+      "loss": 0.7344,
+      "step": 660
+    },
+    {
+      "epoch": 0.3525333333333333,
+      "grad_norm": 0.46612561267966846,
+      "learning_rate": 0.00015029900761497506,
+      "loss": 0.7751,
+      "step": 661
+    },
+    {
+      "epoch": 0.35306666666666664,
+      "grad_norm": 0.3931406497772889,
+      "learning_rate": 0.00015014957868461458,
+      "loss": 0.7541,
+      "step": 662
+    },
+    {
+      "epoch": 0.3536,
+      "grad_norm": 0.37640543832804774,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.6685,
+      "step": 663
+    },
+    {
+      "epoch": 0.35413333333333336,
+      "grad_norm": 0.4475185581718164,
+      "learning_rate": 0.000149850272007796,
+      "loss": 0.8463,
+      "step": 664
+    },
+    {
+      "epoch": 0.3546666666666667,
+      "grad_norm": 0.43555120723890317,
+      "learning_rate": 0.00014970039515511304,
+      "loss": 0.7967,
+      "step": 665
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3912352955790987,
+      "learning_rate": 0.00014955036988950618,
+      "loss": 0.6932,
+      "step": 666
+    },
+    {
+      "epoch": 0.35573333333333335,
+      "grad_norm": 0.4099421378533521,
+      "learning_rate": 0.0001494001966589736,
+      "loss": 0.7297,
+      "step": 667
+    },
+    {
+      "epoch": 0.3562666666666667,
+      "grad_norm": 0.416618684089891,
+      "learning_rate": 0.00014924987591195547,
+      "loss": 0.7484,
+      "step": 668
+    },
+    {
+      "epoch": 0.3568,
+      "grad_norm": 0.37639952072149435,
+      "learning_rate": 0.00014909940809733222,
+      "loss": 0.7055,
+      "step": 669
+    },
+    {
+      "epoch": 0.35733333333333334,
+      "grad_norm": 0.4193840133619344,
+      "learning_rate": 0.0001489487936644237,
+      "loss": 0.791,
+      "step": 670
+    },
+    {
+      "epoch": 0.35786666666666667,
+      "grad_norm": 0.39618466742229524,
+      "learning_rate": 0.00014879803306298736,
+      "loss": 0.7351,
+      "step": 671
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.380114446442462,
+      "learning_rate": 0.00014864712674321734,
+      "loss": 0.7015,
+      "step": 672
+    },
+    {
+      "epoch": 0.3589333333333333,
+      "grad_norm": 0.39050338871450424,
+      "learning_rate": 0.00014849607515574276,
+      "loss": 0.7091,
+      "step": 673
+    },
+    {
+      "epoch": 0.35946666666666666,
+      "grad_norm": 0.4502586694971886,
+      "learning_rate": 0.00014834487875162657,
+      "loss": 0.7017,
+      "step": 674
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.418052943596847,
+      "learning_rate": 0.00014819353798236427,
+      "loss": 0.7427,
+      "step": 675
+    },
+    {
+      "epoch": 0.3605333333333333,
+      "grad_norm": 0.42893056517377914,
+      "learning_rate": 0.00014804205329988225,
+      "loss": 0.7838,
+      "step": 676
+    },
+    {
+      "epoch": 0.36106666666666665,
+      "grad_norm": 0.4241874305859737,
+      "learning_rate": 0.00014789042515653687,
+      "loss": 0.6904,
+      "step": 677
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4459911700099912,
+      "learning_rate": 0.00014773865400511272,
+      "loss": 0.7458,
+      "step": 678
+    },
+    {
+      "epoch": 0.3621333333333333,
+      "grad_norm": 0.41032809921787594,
+      "learning_rate": 0.00014758674029882152,
+      "loss": 0.6959,
+      "step": 679
+    },
+    {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.3855667042541708,
+      "learning_rate": 0.00014743468449130063,
+      "loss": 0.7141,
+      "step": 680
+    },
+    {
+      "epoch": 0.3632,
+      "grad_norm": 0.4629190402722016,
+      "learning_rate": 0.00014728248703661182,
+      "loss": 0.6724,
+      "step": 681
+    },
+    {
+      "epoch": 0.36373333333333335,
+      "grad_norm": 0.46279704677568356,
+      "learning_rate": 0.00014713014838923976,
+      "loss": 0.7906,
+      "step": 682
+    },
+    {
+      "epoch": 0.3642666666666667,
+      "grad_norm": 0.35871998831217716,
+      "learning_rate": 0.00014697766900409074,
+      "loss": 0.6781,
+      "step": 683
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4930608546681951,
+      "learning_rate": 0.00014682504933649144,
+      "loss": 0.8636,
+      "step": 684
+    },
+    {
+      "epoch": 0.36533333333333334,
+      "grad_norm": 0.44256013503274955,
+      "learning_rate": 0.0001466722898421873,
+      "loss": 0.704,
+      "step": 685
+    },
+    {
+      "epoch": 0.3658666666666667,
+      "grad_norm": 0.38062984848447684,
+      "learning_rate": 0.0001465193909773413,
+      "loss": 0.6929,
+      "step": 686
+    },
+    {
+      "epoch": 0.3664,
+      "grad_norm": 0.37907512503141494,
+      "learning_rate": 0.00014636635319853275,
+      "loss": 0.7322,
+      "step": 687
+    },
+    {
+      "epoch": 0.36693333333333333,
+      "grad_norm": 0.4438719005667484,
+      "learning_rate": 0.00014621317696275564,
+      "loss": 0.7368,
+      "step": 688
+    },
+    {
+      "epoch": 0.36746666666666666,
+      "grad_norm": 0.3757027588614008,
+      "learning_rate": 0.00014605986272741748,
+      "loss": 0.7161,
+      "step": 689
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.42762741905841856,
+      "learning_rate": 0.00014590641095033787,
+      "loss": 0.7858,
+      "step": 690
+    },
+    {
+      "epoch": 0.3685333333333333,
+      "grad_norm": 0.4603683509064343,
+      "learning_rate": 0.00014575282208974702,
+      "loss": 0.8062,
+      "step": 691
+    },
+    {
+      "epoch": 0.36906666666666665,
+      "grad_norm": 0.4249345365450475,
+      "learning_rate": 0.00014559909660428468,
+      "loss": 0.6899,
+      "step": 692
+    },
+    {
+      "epoch": 0.3696,
+      "grad_norm": 0.4307264048850522,
+      "learning_rate": 0.00014544523495299842,
+      "loss": 0.7888,
+      "step": 693
+    },
+    {
+      "epoch": 0.3701333333333333,
+      "grad_norm": 0.36253477136303247,
+      "learning_rate": 0.00014529123759534255,
+      "loss": 0.7107,
+      "step": 694
+    },
+    {
+      "epoch": 0.37066666666666664,
+      "grad_norm": 0.4346616376243191,
+      "learning_rate": 0.00014513710499117647,
+      "loss": 0.7099,
+      "step": 695
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5116793752500154,
+      "learning_rate": 0.0001449828376007636,
+      "loss": 0.8613,
+      "step": 696
+    },
+    {
+      "epoch": 0.37173333333333336,
+      "grad_norm": 0.46464733367215383,
+      "learning_rate": 0.00014482843588476974,
+      "loss": 0.7512,
+      "step": 697
+    },
+    {
+      "epoch": 0.3722666666666667,
+      "grad_norm": 0.3939198296872801,
+      "learning_rate": 0.00014467390030426186,
+      "loss": 0.6331,
+      "step": 698
+    },
+    {
+      "epoch": 0.3728,
+      "grad_norm": 0.3903839441336436,
+      "learning_rate": 0.0001445192313207067,
+      "loss": 0.6779,
+      "step": 699
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.38809736264352834,
+      "learning_rate": 0.0001443644293959693,
+      "loss": 0.7305,
+      "step": 700
+    },
+    {
+      "epoch": 0.3738666666666667,
+      "grad_norm": 0.44384900300011626,
+      "learning_rate": 0.00014420949499231172,
+      "loss": 0.7188,
+      "step": 701
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4988692150897368,
+      "learning_rate": 0.0001440544285723915,
+      "loss": 0.8339,
+      "step": 702
+    },
+    {
+      "epoch": 0.37493333333333334,
+      "grad_norm": 0.3832189037895236,
+      "learning_rate": 0.00014389923059926062,
+      "loss": 0.7105,
+      "step": 703
+    },
+    {
+      "epoch": 0.37546666666666667,
+      "grad_norm": 0.369498218784444,
+      "learning_rate": 0.0001437439015363638,
+      "loss": 0.6724,
+      "step": 704
+    },
+    {
+      "epoch": 0.376,
+      "grad_norm": 0.5293337597367642,
+      "learning_rate": 0.00014358844184753712,
+      "loss": 0.7926,
+      "step": 705
+    },
+    {
+      "epoch": 0.37653333333333333,
+      "grad_norm": 0.3947743647114129,
+      "learning_rate": 0.00014343285199700683,
+      "loss": 0.6456,
+      "step": 706
+    },
+    {
+      "epoch": 0.37706666666666666,
+      "grad_norm": 0.3697702493035694,
+      "learning_rate": 0.0001432771324493879,
+      "loss": 0.751,
+      "step": 707
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5008865279430565,
+      "learning_rate": 0.00014312128366968243,
+      "loss": 0.7691,
+      "step": 708
+    },
+    {
+      "epoch": 0.3781333333333333,
+      "grad_norm": 0.43144752424878857,
+      "learning_rate": 0.00014296530612327863,
+      "loss": 0.7458,
+      "step": 709
+    },
+    {
+      "epoch": 0.37866666666666665,
+      "grad_norm": 0.42867835474034455,
+      "learning_rate": 0.00014280920027594907,
+      "loss": 0.7619,
+      "step": 710
+    },
+    {
+      "epoch": 0.3792,
+      "grad_norm": 0.47344859479913404,
+      "learning_rate": 0.00014265296659384956,
+      "loss": 0.785,
+      "step": 711
+    },
+    {
+      "epoch": 0.3797333333333333,
+      "grad_norm": 0.4814850246438516,
+      "learning_rate": 0.00014249660554351752,
+      "loss": 0.898,
+      "step": 712
+    },
+    {
+      "epoch": 0.38026666666666664,
+      "grad_norm": 0.4378968855984162,
+      "learning_rate": 0.00014234011759187083,
+      "loss": 0.6866,
+      "step": 713
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4263778469465353,
+      "learning_rate": 0.00014218350320620624,
+      "loss": 0.7583,
+      "step": 714
+    },
+    {
+      "epoch": 0.38133333333333336,
+      "grad_norm": 0.37241853028261956,
+      "learning_rate": 0.00014202676285419812,
+      "loss": 0.6812,
+      "step": 715
+    },
+    {
+      "epoch": 0.3818666666666667,
+      "grad_norm": 0.39373843615558624,
+      "learning_rate": 0.00014186989700389687,
+      "loss": 0.7345,
+      "step": 716
+    },
+    {
+      "epoch": 0.3824,
+      "grad_norm": 0.3873685666617381,
+      "learning_rate": 0.0001417129061237278,
+      "loss": 0.7189,
+      "step": 717
+    },
+    {
+      "epoch": 0.38293333333333335,
+      "grad_norm": 0.4066160705799862,
+      "learning_rate": 0.0001415557906824895,
+      "loss": 0.7029,
+      "step": 718
+    },
+    {
+      "epoch": 0.3834666666666667,
+      "grad_norm": 0.3808234049232231,
+      "learning_rate": 0.00014139855114935252,
+      "loss": 0.7282,
+      "step": 719
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.42754032553756716,
+      "learning_rate": 0.00014124118799385796,
+      "loss": 0.7403,
+      "step": 720
+    },
+    {
+      "epoch": 0.38453333333333334,
+      "grad_norm": 0.35078303619981066,
+      "learning_rate": 0.0001410837016859161,
+      "loss": 0.6562,
+      "step": 721
+    },
+    {
+      "epoch": 0.38506666666666667,
+      "grad_norm": 0.3615541922056434,
+      "learning_rate": 0.00014092609269580496,
+      "loss": 0.6912,
+      "step": 722
+    },
+    {
+      "epoch": 0.3856,
+      "grad_norm": 0.4006073125444095,
+      "learning_rate": 0.00014076836149416887,
+      "loss": 0.7312,
+      "step": 723
+    },
+    {
+      "epoch": 0.38613333333333333,
+      "grad_norm": 0.4170299356848603,
+      "learning_rate": 0.00014061050855201723,
+      "loss": 0.7935,
+      "step": 724
+    },
+    {
+      "epoch": 0.38666666666666666,
+      "grad_norm": 0.40175583363688694,
+      "learning_rate": 0.0001404525343407228,
+      "loss": 0.6934,
+      "step": 725
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.44773593934774614,
+      "learning_rate": 0.0001402944393320206,
+      "loss": 0.791,
+      "step": 726
+    },
+    {
+      "epoch": 0.3877333333333333,
+      "grad_norm": 0.4367014585195122,
+      "learning_rate": 0.00014013622399800627,
+      "loss": 0.6847,
+      "step": 727
+    },
+    {
+      "epoch": 0.38826666666666665,
+      "grad_norm": 0.3894095198340693,
+      "learning_rate": 0.00013997788881113489,
+      "loss": 0.6947,
+      "step": 728
+    },
+    {
+      "epoch": 0.3888,
+      "grad_norm": 0.4343669476666237,
+      "learning_rate": 0.00013981943424421932,
+      "loss": 0.7432,
+      "step": 729
+    },
+    {
+      "epoch": 0.3893333333333333,
+      "grad_norm": 0.3929218878374168,
+      "learning_rate": 0.0001396608607704289,
+      "loss": 0.6872,
+      "step": 730
+    },
+    {
+      "epoch": 0.38986666666666664,
+      "grad_norm": 0.489807724797637,
+      "learning_rate": 0.0001395021688632882,
+      "loss": 0.7863,
+      "step": 731
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3757636600192832,
+      "learning_rate": 0.00013934335899667527,
+      "loss": 0.659,
+      "step": 732
+    },
+    {
+      "epoch": 0.39093333333333335,
+      "grad_norm": 0.3868383973886252,
+      "learning_rate": 0.00013918443164482046,
+      "loss": 0.7136,
+      "step": 733
+    },
+    {
+      "epoch": 0.3914666666666667,
+      "grad_norm": 0.36586286706983623,
+      "learning_rate": 0.000139025387282305,
+      "loss": 0.6726,
+      "step": 734
+    },
+    {
+      "epoch": 0.392,
+      "grad_norm": 0.41106954005266616,
+      "learning_rate": 0.00013886622638405952,
+      "loss": 0.7518,
+      "step": 735
+    },
+    {
+      "epoch": 0.39253333333333335,
+      "grad_norm": 0.3591853030231446,
+      "learning_rate": 0.0001387069494253626,
+      "loss": 0.6534,
+      "step": 736
+    },
+    {
+      "epoch": 0.3930666666666667,
+      "grad_norm": 0.37879426950938166,
+      "learning_rate": 0.0001385475568818394,
+      "loss": 0.6909,
+      "step": 737
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.45591841792152743,
+      "learning_rate": 0.00013838804922946027,
+      "loss": 0.7621,
+      "step": 738
+    },
+    {
+      "epoch": 0.39413333333333334,
+      "grad_norm": 0.3412298431432226,
+      "learning_rate": 0.00013822842694453924,
+      "loss": 0.645,
+      "step": 739
+    },
+    {
+      "epoch": 0.39466666666666667,
+      "grad_norm": 0.40544822040894074,
+      "learning_rate": 0.0001380686905037327,
+      "loss": 0.6977,
+      "step": 740
+    },
+    {
+      "epoch": 0.3952,
+      "grad_norm": 0.4086369613667534,
+      "learning_rate": 0.00013790884038403795,
+      "loss": 0.7361,
+      "step": 741
+    },
+    {
+      "epoch": 0.3957333333333333,
+      "grad_norm": 0.38809643021441603,
+      "learning_rate": 0.00013774887706279165,
+      "loss": 0.7184,
+      "step": 742
+    },
+    {
+      "epoch": 0.39626666666666666,
+      "grad_norm": 0.4112905583546399,
+      "learning_rate": 0.0001375888010176686,
+      "loss": 0.7576,
+      "step": 743
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4459939402439977,
+      "learning_rate": 0.00013742861272668012,
+      "loss": 0.7349,
+      "step": 744
+    },
+    {
+      "epoch": 0.3973333333333333,
+      "grad_norm": 0.4050813263806746,
+      "learning_rate": 0.00013726831266817278,
+      "loss": 0.7656,
+      "step": 745
+    },
+    {
+      "epoch": 0.39786666666666665,
+      "grad_norm": 0.3667890962820171,
+      "learning_rate": 0.00013710790132082692,
+      "loss": 0.7151,
+      "step": 746
+    },
+    {
+      "epoch": 0.3984,
+      "grad_norm": 0.40011166345292043,
+      "learning_rate": 0.00013694737916365517,
+      "loss": 0.7605,
+      "step": 747
+    },
+    {
+      "epoch": 0.3989333333333333,
+      "grad_norm": 0.4024398695217591,
+      "learning_rate": 0.00013678674667600102,
+      "loss": 0.763,
+      "step": 748
+    },
+    {
+      "epoch": 0.3994666666666667,
+      "grad_norm": 0.3761447111479893,
+      "learning_rate": 0.00013662600433753745,
+      "loss": 0.6506,
+      "step": 749
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.45940242277447846,
+      "learning_rate": 0.00013646515262826552,
+      "loss": 0.6966,
+      "step": 750
+    },
+    {
+      "epoch": 0.40053333333333335,
+      "grad_norm": 0.4110400362440283,
+      "learning_rate": 0.00013630419202851284,
+      "loss": 0.7007,
+      "step": 751
+    },
+    {
+      "epoch": 0.4010666666666667,
+      "grad_norm": 0.36473509121148145,
+      "learning_rate": 0.00013614312301893223,
+      "loss": 0.6411,
+      "step": 752
+    },
+    {
+      "epoch": 0.4016,
+      "grad_norm": 0.47718072134643,
+      "learning_rate": 0.0001359819460805001,
+      "loss": 0.8045,
+      "step": 753
+    },
+    {
+      "epoch": 0.40213333333333334,
+      "grad_norm": 0.40243532664369813,
+      "learning_rate": 0.00013582066169451535,
+      "loss": 0.7402,
+      "step": 754
+    },
+    {
+      "epoch": 0.4026666666666667,
+      "grad_norm": 0.37488828093687565,
+      "learning_rate": 0.0001356592703425976,
+      "loss": 0.6367,
+      "step": 755
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.39537794135335386,
+      "learning_rate": 0.0001354977725066859,
+      "loss": 0.7008,
+      "step": 756
+    },
+    {
+      "epoch": 0.40373333333333333,
+      "grad_norm": 0.43967264906377374,
+      "learning_rate": 0.00013533616866903735,
+      "loss": 0.7082,
+      "step": 757
+    },
+    {
+      "epoch": 0.40426666666666666,
+      "grad_norm": 0.4079234354206369,
+      "learning_rate": 0.0001351744593122255,
+      "loss": 0.7092,
+      "step": 758
+    },
+    {
+      "epoch": 0.4048,
+      "grad_norm": 0.37155253793525617,
+      "learning_rate": 0.00013501264491913906,
+      "loss": 0.681,
+      "step": 759
+    },
+    {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.4031350584934996,
+      "learning_rate": 0.00013485072597298038,
+      "loss": 0.7669,
+      "step": 760
+    },
+    {
+      "epoch": 0.40586666666666665,
+      "grad_norm": 0.4273056447329353,
+      "learning_rate": 0.00013468870295726398,
+      "loss": 0.7677,
+      "step": 761
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.40110795325781556,
+      "learning_rate": 0.0001345265763558152,
+      "loss": 0.6928,
+      "step": 762
+    },
+    {
+      "epoch": 0.4069333333333333,
+      "grad_norm": 0.3921495657087361,
+      "learning_rate": 0.00013436434665276865,
+      "loss": 0.6846,
+      "step": 763
+    },
+    {
+      "epoch": 0.40746666666666664,
+      "grad_norm": 0.4043918351556086,
+      "learning_rate": 0.00013420201433256689,
+      "loss": 0.6543,
+      "step": 764
+    },
+    {
+      "epoch": 0.408,
+      "grad_norm": 0.47659688565165054,
+      "learning_rate": 0.00013403957987995882,
+      "loss": 0.7496,
+      "step": 765
+    },
+    {
+      "epoch": 0.40853333333333336,
+      "grad_norm": 0.3984403784576225,
+      "learning_rate": 0.00013387704377999842,
+      "loss": 0.7314,
+      "step": 766
+    },
+    {
+      "epoch": 0.4090666666666667,
+      "grad_norm": 0.4196372282621598,
+      "learning_rate": 0.00013371440651804313,
+      "loss": 0.7521,
+      "step": 767
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3962362919621647,
+      "learning_rate": 0.0001335516685797525,
+      "loss": 0.7193,
+      "step": 768
+    },
+    {
+      "epoch": 0.41013333333333335,
+      "grad_norm": 0.3944029448445006,
+      "learning_rate": 0.00013338883045108674,
+      "loss": 0.7618,
+      "step": 769
+    },
+    {
+      "epoch": 0.4106666666666667,
+      "grad_norm": 0.36807500602234233,
+      "learning_rate": 0.00013322589261830517,
+      "loss": 0.6694,
+      "step": 770
+    },
+    {
+      "epoch": 0.4112,
+      "grad_norm": 0.44695753722008286,
+      "learning_rate": 0.00013306285556796495,
+      "loss": 0.7664,
+      "step": 771
+    },
+    {
+      "epoch": 0.41173333333333334,
+      "grad_norm": 0.42317405188437496,
+      "learning_rate": 0.0001328997197869194,
+      "loss": 0.7793,
+      "step": 772
+    },
+    {
+      "epoch": 0.41226666666666667,
+      "grad_norm": 0.3748121804380357,
+      "learning_rate": 0.0001327364857623168,
+      "loss": 0.6677,
+      "step": 773
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4038810920792045,
+      "learning_rate": 0.00013257315398159864,
+      "loss": 0.7502,
+      "step": 774
+    },
+    {
+      "epoch": 0.41333333333333333,
+      "grad_norm": 0.3791411563791544,
+      "learning_rate": 0.00013240972493249847,
+      "loss": 0.6638,
+      "step": 775
+    },
+    {
+      "epoch": 0.41386666666666666,
+      "grad_norm": 0.36400656378469093,
+      "learning_rate": 0.0001322461991030402,
+      "loss": 0.6415,
+      "step": 776
+    },
+    {
+      "epoch": 0.4144,
+      "grad_norm": 0.4139989075328661,
+      "learning_rate": 0.00013208257698153677,
+      "loss": 0.7085,
+      "step": 777
+    },
+    {
+      "epoch": 0.4149333333333333,
+      "grad_norm": 0.41415205429654484,
+      "learning_rate": 0.00013191885905658872,
+      "loss": 0.7184,
+      "step": 778
+    },
+    {
+      "epoch": 0.41546666666666665,
+      "grad_norm": 0.47590277067949127,
+      "learning_rate": 0.0001317550458170826,
+      "loss": 0.7941,
+      "step": 779
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3797426296738849,
+      "learning_rate": 0.00013159113775218964,
+      "loss": 0.7064,
+      "step": 780
+    },
+    {
+      "epoch": 0.4165333333333333,
+      "grad_norm": 0.4200393699368836,
+      "learning_rate": 0.00013142713535136414,
+      "loss": 0.7765,
+      "step": 781
+    },
+    {
+      "epoch": 0.41706666666666664,
+      "grad_norm": 0.434728398766787,
+      "learning_rate": 0.00013126303910434214,
+      "loss": 0.7253,
+      "step": 782
+    },
+    {
+      "epoch": 0.4176,
+      "grad_norm": 0.372873424526013,
+      "learning_rate": 0.00013109884950114007,
+      "loss": 0.6113,
+      "step": 783
+    },
+    {
+      "epoch": 0.41813333333333336,
+      "grad_norm": 0.3831484277663798,
+      "learning_rate": 0.00013093456703205288,
+      "loss": 0.6585,
+      "step": 784
+    },
+    {
+      "epoch": 0.4186666666666667,
+      "grad_norm": 0.42422980631296836,
+      "learning_rate": 0.00013077019218765305,
+      "loss": 0.7388,
+      "step": 785
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.47442795680104105,
+      "learning_rate": 0.00013060572545878875,
+      "loss": 0.7876,
+      "step": 786
+    },
+    {
+      "epoch": 0.41973333333333335,
+      "grad_norm": 0.38281098481650605,
+      "learning_rate": 0.0001304411673365826,
+      "loss": 0.6944,
+      "step": 787
+    },
+    {
+      "epoch": 0.4202666666666667,
+      "grad_norm": 0.44915982456514136,
+      "learning_rate": 0.0001302765183124302,
+      "loss": 0.7297,
+      "step": 788
+    },
+    {
+      "epoch": 0.4208,
+      "grad_norm": 0.6527029321721289,
+      "learning_rate": 0.00013011177887799845,
+      "loss": 0.8431,
+      "step": 789
+    },
+    {
+      "epoch": 0.42133333333333334,
+      "grad_norm": 0.45960250321243257,
+      "learning_rate": 0.00012994694952522435,
+      "loss": 0.8288,
+      "step": 790
+    },
+    {
+      "epoch": 0.42186666666666667,
+      "grad_norm": 0.4774546620048671,
+      "learning_rate": 0.00012978203074631334,
+      "loss": 0.7217,
+      "step": 791
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.3828219672976716,
+      "learning_rate": 0.00012961702303373795,
+      "loss": 0.693,
+      "step": 792
+    },
+    {
+      "epoch": 0.42293333333333333,
+      "grad_norm": 0.42307392782787945,
+      "learning_rate": 0.00012945192688023624,
+      "loss": 0.7434,
+      "step": 793
+    },
+    {
+      "epoch": 0.42346666666666666,
+      "grad_norm": 0.38857163031113473,
+      "learning_rate": 0.0001292867427788104,
+      "loss": 0.6445,
+      "step": 794
+    },
+    {
+      "epoch": 0.424,
+      "grad_norm": 0.3805358298818372,
+      "learning_rate": 0.00012912147122272523,
+      "loss": 0.6871,
+      "step": 795
+    },
+    {
+      "epoch": 0.4245333333333333,
+      "grad_norm": 0.3328935629550907,
+      "learning_rate": 0.00012895611270550666,
+      "loss": 0.6533,
+      "step": 796
+    },
+    {
+      "epoch": 0.42506666666666665,
+      "grad_norm": 0.3304526641680735,
+      "learning_rate": 0.0001287906677209403,
+      "loss": 0.598,
+      "step": 797
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.502932741405047,
+      "learning_rate": 0.00012862513676307008,
+      "loss": 0.7889,
+      "step": 798
+    },
+    {
+      "epoch": 0.4261333333333333,
+      "grad_norm": 0.37714530248134004,
+      "learning_rate": 0.0001284595203261965,
+      "loss": 0.7182,
+      "step": 799
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.4282676032795805,
+      "learning_rate": 0.00012829381890487536,
+      "loss": 0.8043,
+      "step": 800
+    },
+    {
+      "epoch": 0.4272,
+      "grad_norm": 0.40579295892150946,
+      "learning_rate": 0.00012812803299391628,
+      "loss": 0.7614,
+      "step": 801
+    },
+    {
+      "epoch": 0.42773333333333335,
+      "grad_norm": 0.43793201883830046,
+      "learning_rate": 0.00012796216308838117,
+      "loss": 0.7839,
+      "step": 802
+    },
+    {
+      "epoch": 0.4282666666666667,
+      "grad_norm": 0.3925544721926815,
+      "learning_rate": 0.00012779620968358273,
+      "loss": 0.7125,
+      "step": 803
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.36328082834695424,
+      "learning_rate": 0.00012763017327508305,
+      "loss": 0.658,
+      "step": 804
+    },
+    {
+      "epoch": 0.42933333333333334,
+      "grad_norm": 0.3981520390409742,
+      "learning_rate": 0.00012746405435869198,
+      "loss": 0.7413,
+      "step": 805
+    },
+    {
+      "epoch": 0.4298666666666667,
+      "grad_norm": 0.4186646716736007,
+      "learning_rate": 0.00012729785343046588,
+      "loss": 0.7108,
+      "step": 806
+    },
+    {
+      "epoch": 0.4304,
+      "grad_norm": 0.4167507640819403,
+      "learning_rate": 0.0001271315709867059,
+      "loss": 0.7129,
+      "step": 807
+    },
+    {
+      "epoch": 0.43093333333333333,
+      "grad_norm": 0.42672646152710686,
+      "learning_rate": 0.00012696520752395672,
+      "loss": 0.6867,
+      "step": 808
+    },
+    {
+      "epoch": 0.43146666666666667,
+      "grad_norm": 0.40609769487387265,
+      "learning_rate": 0.00012679876353900482,
+      "loss": 0.7522,
+      "step": 809
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.40760676107973753,
+      "learning_rate": 0.00012663223952887723,
+      "loss": 0.6635,
+      "step": 810
+    },
+    {
+      "epoch": 0.4325333333333333,
+      "grad_norm": 0.4528448358687043,
+      "learning_rate": 0.00012646563599083996,
+      "loss": 0.7825,
+      "step": 811
+    },
+    {
+      "epoch": 0.43306666666666666,
+      "grad_norm": 0.40701529720953555,
+      "learning_rate": 0.00012629895342239643,
+      "loss": 0.7173,
+      "step": 812
+    },
+    {
+      "epoch": 0.4336,
+      "grad_norm": 0.4020643001179437,
+      "learning_rate": 0.00012613219232128608,
+      "loss": 0.6688,
+      "step": 813
+    },
+    {
+      "epoch": 0.4341333333333333,
+      "grad_norm": 0.40345527344678944,
+      "learning_rate": 0.00012596535318548289,
+      "loss": 0.7329,
+      "step": 814
+    },
+    {
+      "epoch": 0.43466666666666665,
+      "grad_norm": 0.4242645114052418,
+      "learning_rate": 0.0001257984365131938,
+      "loss": 0.7506,
+      "step": 815
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.38049829750202047,
+      "learning_rate": 0.00012563144280285741,
+      "loss": 0.7156,
+      "step": 816
+    },
+    {
+      "epoch": 0.4357333333333333,
+      "grad_norm": 0.38242188825243595,
+      "learning_rate": 0.00012546437255314222,
+      "loss": 0.6209,
+      "step": 817
+    },
+    {
+      "epoch": 0.4362666666666667,
+      "grad_norm": 0.4326541988919919,
+      "learning_rate": 0.0001252972262629454,
+      "loss": 0.724,
+      "step": 818
+    },
+    {
+      "epoch": 0.4368,
+      "grad_norm": 0.40387155552869813,
+      "learning_rate": 0.00012513000443139112,
+      "loss": 0.7514,
+      "step": 819
+    },
+    {
+      "epoch": 0.43733333333333335,
+      "grad_norm": 0.4213488843736418,
+      "learning_rate": 0.00012496270755782914,
+      "loss": 0.7085,
+      "step": 820
+    },
+    {
+      "epoch": 0.4378666666666667,
+      "grad_norm": 0.4219765834921627,
+      "learning_rate": 0.00012479533614183334,
+      "loss": 0.8062,
+      "step": 821
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.44626299335647784,
+      "learning_rate": 0.00012462789068320017,
+      "loss": 0.7364,
+      "step": 822
+    },
+    {
+      "epoch": 0.43893333333333334,
+      "grad_norm": 0.35917611007956723,
+      "learning_rate": 0.00012446037168194714,
+      "loss": 0.6436,
+      "step": 823
+    },
+    {
+      "epoch": 0.43946666666666667,
+      "grad_norm": 0.4284702509214677,
+      "learning_rate": 0.00012429277963831148,
+      "loss": 0.7113,
+      "step": 824
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.3718477597101707,
+      "learning_rate": 0.00012412511505274844,
+      "loss": 0.6734,
+      "step": 825
+    },
+    {
+      "epoch": 0.44053333333333333,
+      "grad_norm": 0.37388387620180413,
+      "learning_rate": 0.00012395737842592995,
+      "loss": 0.6879,
+      "step": 826
+    },
+    {
+      "epoch": 0.44106666666666666,
+      "grad_norm": 0.437811789789759,
+      "learning_rate": 0.000123789570258743,
+      "loss": 0.7298,
+      "step": 827
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3667673569730156,
+      "learning_rate": 0.00012362169105228826,
+      "loss": 0.6608,
+      "step": 828
+    },
+    {
+      "epoch": 0.4421333333333333,
+      "grad_norm": 0.38890167788437835,
+      "learning_rate": 0.00012345374130787854,
+      "loss": 0.6963,
+      "step": 829
+    },
+    {
+      "epoch": 0.44266666666666665,
+      "grad_norm": 0.3776600784418698,
+      "learning_rate": 0.00012328572152703725,
+      "loss": 0.642,
+      "step": 830
+    },
+    {
+      "epoch": 0.4432,
+      "grad_norm": 0.443445459098556,
+      "learning_rate": 0.000123117632211497,
+      "loss": 0.7899,
+      "step": 831
+    },
+    {
+      "epoch": 0.4437333333333333,
+      "grad_norm": 0.41217818516299154,
+      "learning_rate": 0.00012294947386319794,
+      "loss": 0.7527,
+      "step": 832
+    },
+    {
+      "epoch": 0.44426666666666664,
+      "grad_norm": 0.40045702787906,
+      "learning_rate": 0.0001227812469842864,
+      "loss": 0.6717,
+      "step": 833
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4880291023513915,
+      "learning_rate": 0.00012261295207711346,
+      "loss": 0.7917,
+      "step": 834
+    },
+    {
+      "epoch": 0.44533333333333336,
+      "grad_norm": 0.3863030279047588,
+      "learning_rate": 0.00012244458964423327,
+      "loss": 0.7036,
+      "step": 835
+    },
+    {
+      "epoch": 0.4458666666666667,
+      "grad_norm": 0.38075490277545865,
+      "learning_rate": 0.00012227616018840154,
+      "loss": 0.7006,
+      "step": 836
+    },
+    {
+      "epoch": 0.4464,
+      "grad_norm": 0.4279247073163393,
+      "learning_rate": 0.0001221076642125742,
+      "loss": 0.6853,
+      "step": 837
+    },
+    {
+      "epoch": 0.44693333333333335,
+      "grad_norm": 0.4227832738664886,
+      "learning_rate": 0.00012193910221990581,
+      "loss": 0.6787,
+      "step": 838
+    },
+    {
+      "epoch": 0.4474666666666667,
+      "grad_norm": 0.42573542527244784,
+      "learning_rate": 0.00012177047471374807,
+      "loss": 0.7552,
+      "step": 839
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.399384715886149,
+      "learning_rate": 0.00012160178219764837,
+      "loss": 0.6697,
+      "step": 840
+    },
+    {
+      "epoch": 0.44853333333333334,
+      "grad_norm": 0.4463188348707578,
+      "learning_rate": 0.0001214330251753481,
+      "loss": 0.7402,
+      "step": 841
+    },
+    {
+      "epoch": 0.44906666666666667,
+      "grad_norm": 0.3759968799713907,
+      "learning_rate": 0.00012126420415078132,
+      "loss": 0.6409,
+      "step": 842
+    },
+    {
+      "epoch": 0.4496,
+      "grad_norm": 0.37556405363020284,
+      "learning_rate": 0.00012109531962807332,
+      "loss": 0.7072,
+      "step": 843
+    },
+    {
+      "epoch": 0.45013333333333333,
+      "grad_norm": 0.4421198755249828,
+      "learning_rate": 0.00012092637211153885,
+      "loss": 0.7361,
+      "step": 844
+    },
+    {
+      "epoch": 0.45066666666666666,
+      "grad_norm": 0.39512134390623027,
+      "learning_rate": 0.0001207573621056809,
+      "loss": 0.6605,
+      "step": 845
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3855507896392054,
+      "learning_rate": 0.00012058829011518896,
+      "loss": 0.6841,
+      "step": 846
+    },
+    {
+      "epoch": 0.4517333333333333,
+      "grad_norm": 0.4220550953706188,
+      "learning_rate": 0.00012041915664493761,
+      "loss": 0.6901,
+      "step": 847
+    },
+    {
+      "epoch": 0.45226666666666665,
+      "grad_norm": 0.5037067808110443,
+      "learning_rate": 0.00012024996219998517,
+      "loss": 0.7213,
+      "step": 848
+    },
+    {
+      "epoch": 0.4528,
+      "grad_norm": 0.3699300074793161,
+      "learning_rate": 0.00012008070728557186,
+      "loss": 0.6832,
+      "step": 849
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.3990881923428507,
+      "learning_rate": 0.00011991139240711857,
+      "loss": 0.6839,
+      "step": 850
+    },
+    {
+      "epoch": 0.45386666666666664,
+      "grad_norm": 0.4612287868870594,
+      "learning_rate": 0.00011974201807022525,
+      "loss": 0.7065,
+      "step": 851
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.43290972291491714,
+      "learning_rate": 0.00011957258478066931,
+      "loss": 0.7652,
+      "step": 852
+    },
+    {
+      "epoch": 0.45493333333333336,
+      "grad_norm": 0.3696005701833874,
+      "learning_rate": 0.00011940309304440433,
+      "loss": 0.6498,
+      "step": 853
+    },
+    {
+      "epoch": 0.4554666666666667,
+      "grad_norm": 0.41640436222988,
+      "learning_rate": 0.00011923354336755835,
+      "loss": 0.6839,
+      "step": 854
+    },
+    {
+      "epoch": 0.456,
+      "grad_norm": 0.41028691133947254,
+      "learning_rate": 0.00011906393625643244,
+      "loss": 0.7063,
+      "step": 855
+    },
+    {
+      "epoch": 0.45653333333333335,
+      "grad_norm": 0.3877045556479844,
+      "learning_rate": 0.00011889427221749916,
+      "loss": 0.6798,
+      "step": 856
+    },
+    {
+      "epoch": 0.4570666666666667,
+      "grad_norm": 0.3590655618247176,
+      "learning_rate": 0.00011872455175740112,
+      "loss": 0.6322,
+      "step": 857
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4520343453078009,
+      "learning_rate": 0.00011855477538294935,
+      "loss": 0.7226,
+      "step": 858
+    },
+    {
+      "epoch": 0.45813333333333334,
+      "grad_norm": 0.41971932607213375,
+      "learning_rate": 0.00011838494360112185,
+      "loss": 0.7178,
+      "step": 859
+    },
+    {
+      "epoch": 0.45866666666666667,
+      "grad_norm": 0.3879343467070938,
+      "learning_rate": 0.00011821505691906216,
+      "loss": 0.6642,
+      "step": 860
+    },
+    {
+      "epoch": 0.4592,
+      "grad_norm": 0.4005189721475758,
+      "learning_rate": 0.00011804511584407763,
+      "loss": 0.6933,
+      "step": 861
+    },
+    {
+      "epoch": 0.4597333333333333,
+      "grad_norm": 0.42617766373244964,
+      "learning_rate": 0.00011787512088363817,
+      "loss": 0.7703,
+      "step": 862
+    },
+    {
+      "epoch": 0.46026666666666666,
+      "grad_norm": 0.41182650350301025,
+      "learning_rate": 0.00011770507254537453,
+      "loss": 0.6767,
+      "step": 863
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.40678105665865283,
+      "learning_rate": 0.00011753497133707679,
+      "loss": 0.7811,
+      "step": 864
+    },
+    {
+      "epoch": 0.4613333333333333,
+      "grad_norm": 0.37243949404933646,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.6604,
+      "step": 865
+    },
+    {
+      "epoch": 0.46186666666666665,
+      "grad_norm": 0.3769552998407579,
+      "learning_rate": 0.00011719461234232764,
+      "loss": 0.6583,
+      "step": 866
+    },
+    {
+      "epoch": 0.4624,
+      "grad_norm": 0.4373732641135685,
+      "learning_rate": 0.00011702435557223987,
+      "loss": 0.7811,
+      "step": 867
+    },
+    {
+      "epoch": 0.4629333333333333,
+      "grad_norm": 0.425244461080885,
+      "learning_rate": 0.00011685404796484225,
+      "loss": 0.7476,
+      "step": 868
+    },
+    {
+      "epoch": 0.4634666666666667,
+      "grad_norm": 0.3876880020936677,
+      "learning_rate": 0.00011668369002869912,
+      "loss": 0.7322,
+      "step": 869
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4570638397527968,
+      "learning_rate": 0.00011651328227252517,
+      "loss": 0.7778,
+      "step": 870
+    },
+    {
+      "epoch": 0.46453333333333335,
+      "grad_norm": 0.4224961564350872,
+      "learning_rate": 0.00011634282520518383,
+      "loss": 0.7539,
+      "step": 871
+    },
+    {
+      "epoch": 0.4650666666666667,
+      "grad_norm": 0.39304313298896326,
+      "learning_rate": 0.00011617231933568578,
+      "loss": 0.7135,
+      "step": 872
+    },
+    {
+      "epoch": 0.4656,
+      "grad_norm": 0.4828825334181347,
+      "learning_rate": 0.00011600176517318741,
+      "loss": 0.7783,
+      "step": 873
+    },
+    {
+      "epoch": 0.46613333333333334,
+      "grad_norm": 0.39283858698300056,
+      "learning_rate": 0.00011583116322698935,
+      "loss": 0.7025,
+      "step": 874
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.3573495234761727,
+      "learning_rate": 0.00011566051400653486,
+      "loss": 0.6791,
+      "step": 875
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.37294525593683886,
+      "learning_rate": 0.00011548981802140848,
+      "loss": 0.6384,
+      "step": 876
+    },
+    {
+      "epoch": 0.46773333333333333,
+      "grad_norm": 0.3963777636690801,
+      "learning_rate": 0.00011531907578133429,
+      "loss": 0.6534,
+      "step": 877
+    },
+    {
+      "epoch": 0.46826666666666666,
+      "grad_norm": 0.4180680996400134,
+      "learning_rate": 0.00011514828779617459,
+      "loss": 0.7239,
+      "step": 878
+    },
+    {
+      "epoch": 0.4688,
+      "grad_norm": 0.4042374723954916,
+      "learning_rate": 0.00011497745457592816,
+      "loss": 0.6725,
+      "step": 879
+    },
+    {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.3534423154198521,
+      "learning_rate": 0.00011480657663072896,
+      "loss": 0.601,
+      "step": 880
+    },
+    {
+      "epoch": 0.46986666666666665,
+      "grad_norm": 0.388492454037105,
+      "learning_rate": 0.00011463565447084445,
+      "loss": 0.6695,
+      "step": 881
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.44272789321840517,
+      "learning_rate": 0.00011446468860667421,
+      "loss": 0.7276,
+      "step": 882
+    },
+    {
+      "epoch": 0.4709333333333333,
+      "grad_norm": 0.3845541353315032,
+      "learning_rate": 0.00011429367954874819,
+      "loss": 0.6676,
+      "step": 883
+    },
+    {
+      "epoch": 0.47146666666666665,
+      "grad_norm": 0.36052534637825623,
+      "learning_rate": 0.0001141226278077254,
+      "loss": 0.6312,
+      "step": 884
+    },
+    {
+      "epoch": 0.472,
+      "grad_norm": 0.4274973330176012,
+      "learning_rate": 0.00011395153389439233,
+      "loss": 0.7132,
+      "step": 885
+    },
+    {
+      "epoch": 0.47253333333333336,
+      "grad_norm": 0.40812052906393176,
+      "learning_rate": 0.00011378039831966134,
+      "loss": 0.7252,
+      "step": 886
+    },
+    {
+      "epoch": 0.4730666666666667,
+      "grad_norm": 0.47381163596812353,
+      "learning_rate": 0.00011360922159456928,
+      "loss": 0.6752,
+      "step": 887
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3911207899628515,
+      "learning_rate": 0.00011343800423027582,
+      "loss": 0.724,
+      "step": 888
+    },
+    {
+      "epoch": 0.47413333333333335,
+      "grad_norm": 0.4758575998832261,
+      "learning_rate": 0.00011326674673806195,
+      "loss": 0.7727,
+      "step": 889
+    },
+    {
+      "epoch": 0.4746666666666667,
+      "grad_norm": 0.4391072997272904,
+      "learning_rate": 0.00011309544962932862,
+      "loss": 0.7659,
+      "step": 890
+    },
+    {
+      "epoch": 0.4752,
+      "grad_norm": 0.3710229549017468,
+      "learning_rate": 0.0001129241134155949,
+      "loss": 0.6811,
+      "step": 891
+    },
+    {
+      "epoch": 0.47573333333333334,
+      "grad_norm": 0.3801864537301717,
+      "learning_rate": 0.00011275273860849684,
+      "loss": 0.6955,
+      "step": 892
+    },
+    {
+      "epoch": 0.47626666666666667,
+      "grad_norm": 0.41131982423697444,
+      "learning_rate": 0.00011258132571978555,
+      "loss": 0.7136,
+      "step": 893
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.370602754197648,
+      "learning_rate": 0.00011240987526132594,
+      "loss": 0.6703,
+      "step": 894
+    },
+    {
+      "epoch": 0.47733333333333333,
+      "grad_norm": 0.41842152101094837,
+      "learning_rate": 0.00011223838774509514,
+      "loss": 0.7199,
+      "step": 895
+    },
+    {
+      "epoch": 0.47786666666666666,
+      "grad_norm": 0.46319395319681844,
+      "learning_rate": 0.00011206686368318086,
+      "loss": 0.7653,
+      "step": 896
+    },
+    {
+      "epoch": 0.4784,
+      "grad_norm": 0.351304136769783,
+      "learning_rate": 0.00011189530358778005,
+      "loss": 0.6267,
+      "step": 897
+    },
+    {
+      "epoch": 0.4789333333333333,
+      "grad_norm": 0.3788732446079422,
+      "learning_rate": 0.00011172370797119712,
+      "loss": 0.6522,
+      "step": 898
+    },
+    {
+      "epoch": 0.47946666666666665,
+      "grad_norm": 0.4775016588998876,
+      "learning_rate": 0.00011155207734584263,
+      "loss": 0.6575,
+      "step": 899
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4205234225489854,
+      "learning_rate": 0.00011138041222423177,
+      "loss": 0.7306,
+      "step": 900
+    },
+    {
+      "epoch": 0.4805333333333333,
+      "grad_norm": 0.404234104372383,
+      "learning_rate": 0.00011120871311898254,
+      "loss": 0.7095,
+      "step": 901
+    },
+    {
+      "epoch": 0.48106666666666664,
+      "grad_norm": 0.39376381823555934,
+      "learning_rate": 0.0001110369805428146,
+      "loss": 0.6596,
+      "step": 902
+    },
+    {
+      "epoch": 0.4816,
+      "grad_norm": 0.3808123774052564,
+      "learning_rate": 0.00011086521500854745,
+      "loss": 0.6682,
+      "step": 903
+    },
+    {
+      "epoch": 0.48213333333333336,
+      "grad_norm": 0.4229128793483658,
+      "learning_rate": 0.0001106934170290991,
+      "loss": 0.7182,
+      "step": 904
+    },
+    {
+      "epoch": 0.4826666666666667,
+      "grad_norm": 0.40319828089237675,
+      "learning_rate": 0.00011052158711748434,
+      "loss": 0.7313,
+      "step": 905
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.37310361858205293,
+      "learning_rate": 0.00011034972578681338,
+      "loss": 0.6417,
+      "step": 906
+    },
+    {
+      "epoch": 0.48373333333333335,
+      "grad_norm": 0.38928550027066544,
+      "learning_rate": 0.00011017783355029026,
+      "loss": 0.6768,
+      "step": 907
+    },
+    {
+      "epoch": 0.4842666666666667,
+      "grad_norm": 0.3970654743605492,
+      "learning_rate": 0.00011000591092121127,
+      "loss": 0.6732,
+      "step": 908
+    },
+    {
+      "epoch": 0.4848,
+      "grad_norm": 0.4251433741320817,
+      "learning_rate": 0.00010983395841296348,
+      "loss": 0.7463,
+      "step": 909
+    },
+    {
+      "epoch": 0.48533333333333334,
+      "grad_norm": 0.44272744274351933,
+      "learning_rate": 0.0001096619765390232,
+      "loss": 0.7035,
+      "step": 910
+    },
+    {
+      "epoch": 0.48586666666666667,
+      "grad_norm": 0.3849067465010661,
+      "learning_rate": 0.00010948996581295436,
+      "loss": 0.6871,
+      "step": 911
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4225859295226142,
+      "learning_rate": 0.00010931792674840718,
+      "loss": 0.7724,
+      "step": 912
+    },
+    {
+      "epoch": 0.48693333333333333,
+      "grad_norm": 0.43786770910781314,
+      "learning_rate": 0.00010914585985911632,
+      "loss": 0.7049,
+      "step": 913
+    },
+    {
+      "epoch": 0.48746666666666666,
+      "grad_norm": 0.37997218728789206,
+      "learning_rate": 0.00010897376565889971,
+      "loss": 0.6473,
+      "step": 914
+    },
+    {
+      "epoch": 0.488,
+      "grad_norm": 0.4191191877744639,
+      "learning_rate": 0.00010880164466165674,
+      "loss": 0.7154,
+      "step": 915
+    },
+    {
+      "epoch": 0.4885333333333333,
+      "grad_norm": 0.37749020920129717,
+      "learning_rate": 0.00010862949738136681,
+      "loss": 0.6718,
+      "step": 916
+    },
+    {
+      "epoch": 0.48906666666666665,
+      "grad_norm": 0.5116400364670469,
+      "learning_rate": 0.00010845732433208779,
+      "loss": 0.7458,
+      "step": 917
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3797726500887027,
+      "learning_rate": 0.00010828512602795462,
+      "loss": 0.6239,
+      "step": 918
+    },
+    {
+      "epoch": 0.4901333333333333,
+      "grad_norm": 0.40612311974486387,
+      "learning_rate": 0.00010811290298317755,
+      "loss": 0.6561,
+      "step": 919
+    },
+    {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.4550093082713764,
+      "learning_rate": 0.00010794065571204072,
+      "loss": 0.7613,
+      "step": 920
+    },
+    {
+      "epoch": 0.4912,
+      "grad_norm": 0.45656721520114374,
+      "learning_rate": 0.00010776838472890065,
+      "loss": 0.7069,
+      "step": 921
+    },
+    {
+      "epoch": 0.49173333333333336,
+      "grad_norm": 0.43808840491427853,
+      "learning_rate": 0.00010759609054818458,
+      "loss": 0.7258,
+      "step": 922
+    },
+    {
+      "epoch": 0.4922666666666667,
+      "grad_norm": 0.40561370160676213,
+      "learning_rate": 0.00010742377368438914,
+      "loss": 0.6875,
+      "step": 923
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4590103093416851,
+      "learning_rate": 0.00010725143465207867,
+      "loss": 0.7326,
+      "step": 924
+    },
+    {
+      "epoch": 0.49333333333333335,
+      "grad_norm": 0.40124825398399744,
+      "learning_rate": 0.00010707907396588361,
+      "loss": 0.695,
+      "step": 925
+    },
+    {
+      "epoch": 0.4938666666666667,
+      "grad_norm": 0.3936243245443685,
+      "learning_rate": 0.0001069066921404992,
+      "loss": 0.6688,
+      "step": 926
+    },
+    {
+      "epoch": 0.4944,
+      "grad_norm": 0.3870139727112456,
+      "learning_rate": 0.00010673428969068364,
+      "loss": 0.673,
+      "step": 927
+    },
+    {
+      "epoch": 0.49493333333333334,
+      "grad_norm": 0.44462372212712714,
+      "learning_rate": 0.00010656186713125689,
+      "loss": 0.7599,
+      "step": 928
+    },
+    {
+      "epoch": 0.49546666666666667,
+      "grad_norm": 0.3956370931937604,
+      "learning_rate": 0.0001063894249770989,
+      "loss": 0.6413,
+      "step": 929
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3996632685907131,
+      "learning_rate": 0.00010621696374314807,
+      "loss": 0.6835,
+      "step": 930
+    },
+    {
+      "epoch": 0.4965333333333333,
+      "grad_norm": 0.4068104029537071,
+      "learning_rate": 0.00010604448394439983,
+      "loss": 0.7157,
+      "step": 931
+    },
+    {
+      "epoch": 0.49706666666666666,
+      "grad_norm": 0.42322336691701257,
+      "learning_rate": 0.00010587198609590505,
+      "loss": 0.7218,
+      "step": 932
+    },
+    {
+      "epoch": 0.4976,
+      "grad_norm": 0.4395433933653365,
+      "learning_rate": 0.00010569947071276847,
+      "loss": 0.6782,
+      "step": 933
+    },
+    {
+      "epoch": 0.4981333333333333,
+      "grad_norm": 0.38059636946312014,
+      "learning_rate": 0.00010552693831014726,
+      "loss": 0.6609,
+      "step": 934
+    },
+    {
+      "epoch": 0.49866666666666665,
+      "grad_norm": 0.39326972293461676,
+      "learning_rate": 0.0001053543894032493,
+      "loss": 0.6654,
+      "step": 935
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3589359665208239,
+      "learning_rate": 0.00010518182450733186,
+      "loss": 0.6738,
+      "step": 936
+    },
+    {
+      "epoch": 0.4997333333333333,
+      "grad_norm": 0.7439200515520827,
+      "learning_rate": 0.00010500924413769988,
+      "loss": 0.747,
+      "step": 937
+    },
+    {
+      "epoch": 0.5002666666666666,
+      "grad_norm": 0.40245284090630545,
+      "learning_rate": 0.00010483664880970457,
+      "loss": 0.6778,
+      "step": 938
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 0.38695190004885227,
+      "learning_rate": 0.00010466403903874176,
+      "loss": 0.6907,
+      "step": 939
+    },
+    {
+      "epoch": 0.5013333333333333,
+      "grad_norm": 0.41700901306124494,
+      "learning_rate": 0.00010449141534025045,
+      "loss": 0.6924,
+      "step": 940
+    },
+    {
+      "epoch": 0.5018666666666667,
+      "grad_norm": 0.4322749113351211,
+      "learning_rate": 0.00010431877822971117,
+      "loss": 0.6861,
+      "step": 941
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.40759648053081476,
+      "learning_rate": 0.00010414612822264455,
+      "loss": 0.6973,
+      "step": 942
+    },
+    {
+      "epoch": 0.5029333333333333,
+      "grad_norm": 0.4064060370392942,
+      "learning_rate": 0.00010397346583460971,
+      "loss": 0.6783,
+      "step": 943
+    },
+    {
+      "epoch": 0.5034666666666666,
+      "grad_norm": 0.38600386952418253,
+      "learning_rate": 0.0001038007915812028,
+      "loss": 0.6561,
+      "step": 944
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 0.390818005507248,
+      "learning_rate": 0.00010362810597805526,
+      "loss": 0.6918,
+      "step": 945
+    },
+    {
+      "epoch": 0.5045333333333333,
+      "grad_norm": 0.41219389096047593,
+      "learning_rate": 0.0001034554095408326,
+      "loss": 0.6661,
+      "step": 946
+    },
+    {
+      "epoch": 0.5050666666666667,
+      "grad_norm": 0.3733957810465592,
+      "learning_rate": 0.00010328270278523256,
+      "loss": 0.6712,
+      "step": 947
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4108558643674988,
+      "learning_rate": 0.0001031099862269837,
+      "loss": 0.715,
+      "step": 948
+    },
+    {
+      "epoch": 0.5061333333333333,
+      "grad_norm": 0.3971854201780952,
+      "learning_rate": 0.00010293726038184393,
+      "loss": 0.7098,
+      "step": 949
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.43214348210498155,
+      "learning_rate": 0.00010276452576559879,
+      "loss": 0.697,
+      "step": 950
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 0.43256766403814767,
+      "learning_rate": 0.00010259178289406011,
+      "loss": 0.7776,
+      "step": 951
+    },
+    {
+      "epoch": 0.5077333333333334,
+      "grad_norm": 0.4031860596371652,
+      "learning_rate": 0.00010241903228306431,
+      "loss": 0.6837,
+      "step": 952
+    },
+    {
+      "epoch": 0.5082666666666666,
+      "grad_norm": 0.373606372821283,
+      "learning_rate": 0.0001022462744484709,
+      "loss": 0.6526,
+      "step": 953
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3609247548304206,
+      "learning_rate": 0.00010207350990616107,
+      "loss": 0.6742,
+      "step": 954
+    },
+    {
+      "epoch": 0.5093333333333333,
+      "grad_norm": 0.36064261171581113,
+      "learning_rate": 0.00010190073917203589,
+      "loss": 0.6409,
+      "step": 955
+    },
+    {
+      "epoch": 0.5098666666666667,
+      "grad_norm": 0.4271338586204693,
+      "learning_rate": 0.00010172796276201503,
+      "loss": 0.7081,
+      "step": 956
+    },
+    {
+      "epoch": 0.5104,
+      "grad_norm": 0.4337110973339837,
+      "learning_rate": 0.0001015551811920351,
+      "loss": 0.6522,
+      "step": 957
+    },
+    {
+      "epoch": 0.5109333333333334,
+      "grad_norm": 0.4067268385500411,
+      "learning_rate": 0.00010138239497804804,
+      "loss": 0.6979,
+      "step": 958
+    },
+    {
+      "epoch": 0.5114666666666666,
+      "grad_norm": 0.422313982241205,
+      "learning_rate": 0.00010120960463601976,
+      "loss": 0.7357,
+      "step": 959
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4283993469438326,
+      "learning_rate": 0.00010103681068192845,
+      "loss": 0.7276,
+      "step": 960
+    },
+    {
+      "epoch": 0.5125333333333333,
+      "grad_norm": 0.35585101619802095,
+      "learning_rate": 0.00010086401363176305,
+      "loss": 0.6589,
+      "step": 961
+    },
+    {
+      "epoch": 0.5130666666666667,
+      "grad_norm": 0.3885030459699722,
+      "learning_rate": 0.00010069121400152181,
+      "loss": 0.6157,
+      "step": 962
+    },
+    {
+      "epoch": 0.5136,
+      "grad_norm": 0.4611681571492855,
+      "learning_rate": 0.00010051841230721065,
+      "loss": 0.7334,
+      "step": 963
+    },
+    {
+      "epoch": 0.5141333333333333,
+      "grad_norm": 0.3789298367290173,
+      "learning_rate": 0.0001003456090648416,
+      "loss": 0.6339,
+      "step": 964
+    },
+    {
+      "epoch": 0.5146666666666667,
+      "grad_norm": 0.3955283433599766,
+      "learning_rate": 0.00010017280479043147,
+      "loss": 0.6829,
+      "step": 965
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3828712806641332,
+      "learning_rate": 0.0001,
+      "loss": 0.6907,
+      "step": 966
+    },
+    {
+      "epoch": 0.5157333333333334,
+      "grad_norm": 0.44360063787426984,
+      "learning_rate": 9.982719520956855e-05,
+      "loss": 0.7593,
+      "step": 967
+    },
+    {
+      "epoch": 0.5162666666666667,
+      "grad_norm": 0.41959626066678757,
+      "learning_rate": 9.965439093515841e-05,
+      "loss": 0.6632,
+      "step": 968
+    },
+    {
+      "epoch": 0.5168,
+      "grad_norm": 0.4178887763089197,
+      "learning_rate": 9.948158769278939e-05,
+      "loss": 0.7088,
+      "step": 969
+    },
+    {
+      "epoch": 0.5173333333333333,
+      "grad_norm": 0.4273794934405812,
+      "learning_rate": 9.930878599847821e-05,
+      "loss": 0.6826,
+      "step": 970
+    },
+    {
+      "epoch": 0.5178666666666667,
+      "grad_norm": 0.44613303752223565,
+      "learning_rate": 9.913598636823693e-05,
+      "loss": 0.6792,
+      "step": 971
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.41113502584267975,
+      "learning_rate": 9.896318931807155e-05,
+      "loss": 0.6822,
+      "step": 972
+    },
+    {
+      "epoch": 0.5189333333333334,
+      "grad_norm": 0.40633883955027034,
+      "learning_rate": 9.879039536398024e-05,
+      "loss": 0.7026,
+      "step": 973
+    },
+    {
+      "epoch": 0.5194666666666666,
+      "grad_norm": 0.4383419652470069,
+      "learning_rate": 9.861760502195197e-05,
+      "loss": 0.7594,
+      "step": 974
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.4428434561270538,
+      "learning_rate": 9.844481880796491e-05,
+      "loss": 0.7308,
+      "step": 975
+    },
+    {
+      "epoch": 0.5205333333333333,
+      "grad_norm": 0.3884338091203555,
+      "learning_rate": 9.827203723798498e-05,
+      "loss": 0.6943,
+      "step": 976
+    },
+    {
+      "epoch": 0.5210666666666667,
+      "grad_norm": 0.37470666307947165,
+      "learning_rate": 9.809926082796415e-05,
+      "loss": 0.6707,
+      "step": 977
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.41801931443942375,
+      "learning_rate": 9.792649009383899e-05,
+      "loss": 0.6353,
+      "step": 978
+    },
+    {
+      "epoch": 0.5221333333333333,
+      "grad_norm": 0.40988747215255444,
+      "learning_rate": 9.775372555152912e-05,
+      "loss": 0.7005,
+      "step": 979
+    },
+    {
+      "epoch": 0.5226666666666666,
+      "grad_norm": 0.39827425654353676,
+      "learning_rate": 9.758096771693573e-05,
+      "loss": 0.6945,
+      "step": 980
+    },
+    {
+      "epoch": 0.5232,
+      "grad_norm": 0.4558415624953074,
+      "learning_rate": 9.740821710593989e-05,
+      "loss": 0.7509,
+      "step": 981
+    },
+    {
+      "epoch": 0.5237333333333334,
+      "grad_norm": 0.3526514170147418,
+      "learning_rate": 9.723547423440122e-05,
+      "loss": 0.6492,
+      "step": 982
+    },
+    {
+      "epoch": 0.5242666666666667,
+      "grad_norm": 0.4579002707791232,
+      "learning_rate": 9.70627396181561e-05,
+      "loss": 0.7733,
+      "step": 983
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.40582604777311926,
+      "learning_rate": 9.689001377301633e-05,
+      "loss": 0.7124,
+      "step": 984
+    },
+    {
+      "epoch": 0.5253333333333333,
+      "grad_norm": 0.4418290278290973,
+      "learning_rate": 9.671729721476746e-05,
+      "loss": 0.7811,
+      "step": 985
+    },
+    {
+      "epoch": 0.5258666666666667,
+      "grad_norm": 0.38216328313521813,
+      "learning_rate": 9.654459045916743e-05,
+      "loss": 0.6807,
+      "step": 986
+    },
+    {
+      "epoch": 0.5264,
+      "grad_norm": 0.3801398406592987,
+      "learning_rate": 9.637189402194476e-05,
+      "loss": 0.7571,
+      "step": 987
+    },
+    {
+      "epoch": 0.5269333333333334,
+      "grad_norm": 0.36203756917696256,
+      "learning_rate": 9.619920841879725e-05,
+      "loss": 0.6301,
+      "step": 988
+    },
+    {
+      "epoch": 0.5274666666666666,
+      "grad_norm": 0.4068819481440644,
+      "learning_rate": 9.602653416539031e-05,
+      "loss": 0.6912,
+      "step": 989
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.4427916418611001,
+      "learning_rate": 9.585387177735547e-05,
+      "loss": 0.7901,
+      "step": 990
+    },
+    {
+      "epoch": 0.5285333333333333,
+      "grad_norm": 0.36702428866772624,
+      "learning_rate": 9.568122177028884e-05,
+      "loss": 0.646,
+      "step": 991
+    },
+    {
+      "epoch": 0.5290666666666667,
+      "grad_norm": 0.39937231114065663,
+      "learning_rate": 9.550858465974958e-05,
+      "loss": 0.7289,
+      "step": 992
+    },
+    {
+      "epoch": 0.5296,
+      "grad_norm": 0.42816775455491346,
+      "learning_rate": 9.533596096125825e-05,
+      "loss": 0.7307,
+      "step": 993
+    },
+    {
+      "epoch": 0.5301333333333333,
+      "grad_norm": 0.38077046269227127,
+      "learning_rate": 9.516335119029546e-05,
+      "loss": 0.6121,
+      "step": 994
+    },
+    {
+      "epoch": 0.5306666666666666,
+      "grad_norm": 0.3923238270995547,
+      "learning_rate": 9.499075586230013e-05,
+      "loss": 0.6923,
+      "step": 995
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.36767549701070884,
+      "learning_rate": 9.481817549266817e-05,
+      "loss": 0.6722,
+      "step": 996
+    },
+    {
+      "epoch": 0.5317333333333333,
+      "grad_norm": 0.36190059027736843,
+      "learning_rate": 9.464561059675073e-05,
+      "loss": 0.6385,
+      "step": 997
+    },
+    {
+      "epoch": 0.5322666666666667,
+      "grad_norm": 0.3856851764676648,
+      "learning_rate": 9.44730616898528e-05,
+      "loss": 0.7118,
+      "step": 998
+    },
+    {
+      "epoch": 0.5328,
+      "grad_norm": 0.37045340896227735,
+      "learning_rate": 9.430052928723153e-05,
+      "loss": 0.6541,
+      "step": 999
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.4018945451754231,
+      "learning_rate": 9.412801390409497e-05,
+      "loss": 0.707,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5338666666666667,
+      "grad_norm": 0.37834413149463314,
+      "learning_rate": 9.395551605560018e-05,
+      "loss": 0.7148,
+      "step": 1001
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.40264628353900195,
+      "learning_rate": 9.378303625685195e-05,
+      "loss": 0.7668,
+      "step": 1002
+    },
+    {
+      "epoch": 0.5349333333333334,
+      "grad_norm": 0.3813446944853318,
+      "learning_rate": 9.361057502290113e-05,
+      "loss": 0.6816,
+      "step": 1003
+    },
+    {
+      "epoch": 0.5354666666666666,
+      "grad_norm": 0.45300385140327293,
+      "learning_rate": 9.343813286874312e-05,
+      "loss": 0.695,
+      "step": 1004
+    },
+    {
+      "epoch": 0.536,
+      "grad_norm": 0.4318609221754859,
+      "learning_rate": 9.326571030931637e-05,
+      "loss": 0.7173,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5365333333333333,
+      "grad_norm": 0.3818750659761753,
+      "learning_rate": 9.309330785950086e-05,
+      "loss": 0.707,
+      "step": 1006
+    },
+    {
+      "epoch": 0.5370666666666667,
+      "grad_norm": 0.41832051824530075,
+      "learning_rate": 9.292092603411641e-05,
+      "loss": 0.67,
+      "step": 1007
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.47118792133565957,
+      "learning_rate": 9.274856534792138e-05,
+      "loss": 0.815,
+      "step": 1008
+    },
+    {
+      "epoch": 0.5381333333333334,
+      "grad_norm": 0.45440639362174423,
+      "learning_rate": 9.257622631561085e-05,
+      "loss": 0.8078,
+      "step": 1009
+    },
+    {
+      "epoch": 0.5386666666666666,
+      "grad_norm": 0.41139339937931796,
+      "learning_rate": 9.240390945181543e-05,
+      "loss": 0.6589,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5392,
+      "grad_norm": 0.4242877542971082,
+      "learning_rate": 9.223161527109937e-05,
+      "loss": 0.6894,
+      "step": 1011
+    },
+    {
+      "epoch": 0.5397333333333333,
+      "grad_norm": 0.4122314626824244,
+      "learning_rate": 9.205934428795929e-05,
+      "loss": 0.6659,
+      "step": 1012
+    },
+    {
+      "epoch": 0.5402666666666667,
+      "grad_norm": 0.3942405704639682,
+      "learning_rate": 9.188709701682247e-05,
+      "loss": 0.6372,
+      "step": 1013
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.48786014425422236,
+      "learning_rate": 9.171487397204539e-05,
+      "loss": 0.758,
+      "step": 1014
+    },
+    {
+      "epoch": 0.5413333333333333,
+      "grad_norm": 0.4586009322004272,
+      "learning_rate": 9.154267566791223e-05,
+      "loss": 0.7957,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5418666666666667,
+      "grad_norm": 0.3481292378201403,
+      "learning_rate": 9.137050261863324e-05,
+      "loss": 0.6301,
+      "step": 1016
+    },
+    {
+      "epoch": 0.5424,
+      "grad_norm": 0.4261357442235494,
+      "learning_rate": 9.119835533834331e-05,
+      "loss": 0.6973,
+      "step": 1017
+    },
+    {
+      "epoch": 0.5429333333333334,
+      "grad_norm": 0.43015655288662064,
+      "learning_rate": 9.102623434110028e-05,
+      "loss": 0.6912,
+      "step": 1018
+    },
+    {
+      "epoch": 0.5434666666666667,
+      "grad_norm": 0.44261951636802793,
+      "learning_rate": 9.085414014088369e-05,
+      "loss": 0.7114,
+      "step": 1019
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.39275232774805957,
+      "learning_rate": 9.068207325159284e-05,
+      "loss": 0.7018,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5445333333333333,
+      "grad_norm": 0.36966361537506054,
+      "learning_rate": 9.051003418704565e-05,
+      "loss": 0.7008,
+      "step": 1021
+    },
+    {
+      "epoch": 0.5450666666666667,
+      "grad_norm": 0.39882434045775006,
+      "learning_rate": 9.033802346097682e-05,
+      "loss": 0.6969,
+      "step": 1022
+    },
+    {
+      "epoch": 0.5456,
+      "grad_norm": 0.4307931222905577,
+      "learning_rate": 9.016604158703654e-05,
+      "loss": 0.7471,
+      "step": 1023
+    },
+    {
+      "epoch": 0.5461333333333334,
+      "grad_norm": 0.412152370318337,
+      "learning_rate": 8.999408907878877e-05,
+      "loss": 0.7388,
+      "step": 1024
+    },
+    {
+      "epoch": 0.5466666666666666,
+      "grad_norm": 0.44841521091369047,
+      "learning_rate": 8.982216644970979e-05,
+      "loss": 0.7133,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.38746110189562677,
+      "learning_rate": 8.965027421318665e-05,
+      "loss": 0.6278,
+      "step": 1026
+    },
+    {
+      "epoch": 0.5477333333333333,
+      "grad_norm": 0.3642761748368395,
+      "learning_rate": 8.947841288251568e-05,
+      "loss": 0.6477,
+      "step": 1027
+    },
+    {
+      "epoch": 0.5482666666666667,
+      "grad_norm": 0.38558497241363154,
+      "learning_rate": 8.930658297090091e-05,
+      "loss": 0.6808,
+      "step": 1028
+    },
+    {
+      "epoch": 0.5488,
+      "grad_norm": 0.42269928725720274,
+      "learning_rate": 8.913478499145254e-05,
+      "loss": 0.6766,
+      "step": 1029
+    },
+    {
+      "epoch": 0.5493333333333333,
+      "grad_norm": 0.4684943238768462,
+      "learning_rate": 8.896301945718541e-05,
+      "loss": 0.714,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5498666666666666,
+      "grad_norm": 0.45473212582450806,
+      "learning_rate": 8.879128688101749e-05,
+      "loss": 0.7519,
+      "step": 1031
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.38370641760493457,
+      "learning_rate": 8.861958777576827e-05,
+      "loss": 0.6784,
+      "step": 1032
+    },
+    {
+      "epoch": 0.5509333333333334,
+      "grad_norm": 0.4446554193607567,
+      "learning_rate": 8.844792265415738e-05,
+      "loss": 0.7339,
+      "step": 1033
+    },
+    {
+      "epoch": 0.5514666666666667,
+      "grad_norm": 0.5021184919345888,
+      "learning_rate": 8.827629202880293e-05,
+      "loss": 0.701,
+      "step": 1034
+    },
+    {
+      "epoch": 0.552,
+      "grad_norm": 0.37286008172473245,
+      "learning_rate": 8.810469641222001e-05,
+      "loss": 0.6382,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5525333333333333,
+      "grad_norm": 0.3898261852650365,
+      "learning_rate": 8.793313631681915e-05,
+      "loss": 0.6524,
+      "step": 1036
+    },
+    {
+      "epoch": 0.5530666666666667,
+      "grad_norm": 0.39075418110480387,
+      "learning_rate": 8.776161225490489e-05,
+      "loss": 0.6586,
+      "step": 1037
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.34568309289503596,
+      "learning_rate": 8.759012473867407e-05,
+      "loss": 0.5977,
+      "step": 1038
+    },
+    {
+      "epoch": 0.5541333333333334,
+      "grad_norm": 0.37554068268681406,
+      "learning_rate": 8.741867428021446e-05,
+      "loss": 0.7089,
+      "step": 1039
+    },
+    {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.4318235268052963,
+      "learning_rate": 8.724726139150318e-05,
+      "loss": 0.7457,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5552,
+      "grad_norm": 0.41952136163111886,
+      "learning_rate": 8.707588658440511e-05,
+      "loss": 0.6684,
+      "step": 1041
+    },
+    {
+      "epoch": 0.5557333333333333,
+      "grad_norm": 0.3535850500532471,
+      "learning_rate": 8.690455037067141e-05,
+      "loss": 0.6302,
+      "step": 1042
+    },
+    {
+      "epoch": 0.5562666666666667,
+      "grad_norm": 0.40140318453092033,
+      "learning_rate": 8.673325326193806e-05,
+      "loss": 0.667,
+      "step": 1043
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.4126501746046163,
+      "learning_rate": 8.656199576972423e-05,
+      "loss": 0.7622,
+      "step": 1044
+    },
+    {
+      "epoch": 0.5573333333333333,
+      "grad_norm": 0.36902047840591456,
+      "learning_rate": 8.639077840543077e-05,
+      "loss": 0.6748,
+      "step": 1045
+    },
+    {
+      "epoch": 0.5578666666666666,
+      "grad_norm": 0.4218383182542466,
+      "learning_rate": 8.621960168033867e-05,
+      "loss": 0.6723,
+      "step": 1046
+    },
+    {
+      "epoch": 0.5584,
+      "grad_norm": 0.4803868703274385,
+      "learning_rate": 8.604846610560771e-05,
+      "loss": 0.6888,
+      "step": 1047
+    },
+    {
+      "epoch": 0.5589333333333333,
+      "grad_norm": 0.41499017197056104,
+      "learning_rate": 8.587737219227462e-05,
+      "loss": 0.6625,
+      "step": 1048
+    },
+    {
+      "epoch": 0.5594666666666667,
+      "grad_norm": 0.4758356386087854,
+      "learning_rate": 8.570632045125185e-05,
+      "loss": 0.691,
+      "step": 1049
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3575237560348443,
+      "learning_rate": 8.553531139332582e-05,
+      "loss": 0.683,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5605333333333333,
+      "grad_norm": 0.4191047803710979,
+      "learning_rate": 8.536434552915556e-05,
+      "loss": 0.7114,
+      "step": 1051
+    },
+    {
+      "epoch": 0.5610666666666667,
+      "grad_norm": 0.4416907587765159,
+      "learning_rate": 8.519342336927105e-05,
+      "loss": 0.7463,
+      "step": 1052
+    },
+    {
+      "epoch": 0.5616,
+      "grad_norm": 0.45971802278763857,
+      "learning_rate": 8.502254542407186e-05,
+      "loss": 0.7283,
+      "step": 1053
+    },
+    {
+      "epoch": 0.5621333333333334,
+      "grad_norm": 0.39686142791469187,
+      "learning_rate": 8.485171220382545e-05,
+      "loss": 0.693,
+      "step": 1054
+    },
+    {
+      "epoch": 0.5626666666666666,
+      "grad_norm": 0.40324297018751654,
+      "learning_rate": 8.468092421866573e-05,
+      "loss": 0.6466,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.40873675796188375,
+      "learning_rate": 8.451018197859153e-05,
+      "loss": 0.679,
+      "step": 1056
+    },
+    {
+      "epoch": 0.5637333333333333,
+      "grad_norm": 0.3648416970361638,
+      "learning_rate": 8.433948599346516e-05,
+      "loss": 0.6645,
+      "step": 1057
+    },
+    {
+      "epoch": 0.5642666666666667,
+      "grad_norm": 0.4037285077267891,
+      "learning_rate": 8.416883677301069e-05,
+      "loss": 0.6495,
+      "step": 1058
+    },
+    {
+      "epoch": 0.5648,
+      "grad_norm": 0.36521119192743473,
+      "learning_rate": 8.399823482681262e-05,
+      "loss": 0.6472,
+      "step": 1059
+    },
+    {
+      "epoch": 0.5653333333333334,
+      "grad_norm": 0.4243778904952702,
+      "learning_rate": 8.382768066431425e-05,
+      "loss": 0.6567,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5658666666666666,
+      "grad_norm": 0.42523133489332654,
+      "learning_rate": 8.36571747948162e-05,
+      "loss": 0.733,
+      "step": 1061
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.415875245008936,
+      "learning_rate": 8.348671772747487e-05,
+      "loss": 0.6837,
+      "step": 1062
+    },
+    {
+      "epoch": 0.5669333333333333,
+      "grad_norm": 0.40115003336266325,
+      "learning_rate": 8.33163099713009e-05,
+      "loss": 0.6847,
+      "step": 1063
+    },
+    {
+      "epoch": 0.5674666666666667,
+      "grad_norm": 0.32881616896424515,
+      "learning_rate": 8.31459520351578e-05,
+      "loss": 0.6213,
+      "step": 1064
+    },
+    {
+      "epoch": 0.568,
+      "grad_norm": 0.4275860836489642,
+      "learning_rate": 8.297564442776014e-05,
+      "loss": 0.6188,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5685333333333333,
+      "grad_norm": 0.3859053081623988,
+      "learning_rate": 8.280538765767235e-05,
+      "loss": 0.6332,
+      "step": 1066
+    },
+    {
+      "epoch": 0.5690666666666667,
+      "grad_norm": 0.412578914524153,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.6908,
+      "step": 1067
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.375431620235667,
+      "learning_rate": 8.246502866292324e-05,
+      "loss": 0.6565,
+      "step": 1068
+    },
+    {
+      "epoch": 0.5701333333333334,
+      "grad_norm": 0.3890627334157619,
+      "learning_rate": 8.22949274546255e-05,
+      "loss": 0.6683,
+      "step": 1069
+    },
+    {
+      "epoch": 0.5706666666666667,
+      "grad_norm": 0.4362939883861216,
+      "learning_rate": 8.212487911636184e-05,
+      "loss": 0.6673,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5712,
+      "grad_norm": 0.3848179229246365,
+      "learning_rate": 8.195488415592238e-05,
+      "loss": 0.6791,
+      "step": 1071
+    },
+    {
+      "epoch": 0.5717333333333333,
+      "grad_norm": 0.4161599348707075,
+      "learning_rate": 8.178494308093789e-05,
+      "loss": 0.6738,
+      "step": 1072
+    },
+    {
+      "epoch": 0.5722666666666667,
+      "grad_norm": 0.39481859626708815,
+      "learning_rate": 8.161505639887817e-05,
+      "loss": 0.6776,
+      "step": 1073
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.4790895542819692,
+      "learning_rate": 8.144522461705067e-05,
+      "loss": 0.7418,
+      "step": 1074
+    },
+    {
+      "epoch": 0.5733333333333334,
+      "grad_norm": 0.38934276938196133,
+      "learning_rate": 8.127544824259889e-05,
+      "loss": 0.6913,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5738666666666666,
+      "grad_norm": 0.3996181990520993,
+      "learning_rate": 8.110572778250085e-05,
+      "loss": 0.69,
+      "step": 1076
+    },
+    {
+      "epoch": 0.5744,
+      "grad_norm": 0.4016846160311755,
+      "learning_rate": 8.093606374356759e-05,
+      "loss": 0.7222,
+      "step": 1077
+    },
+    {
+      "epoch": 0.5749333333333333,
+      "grad_norm": 0.40645679452658495,
+      "learning_rate": 8.076645663244168e-05,
+      "loss": 0.6975,
+      "step": 1078
+    },
+    {
+      "epoch": 0.5754666666666667,
+      "grad_norm": 0.3821026612501412,
+      "learning_rate": 8.059690695559568e-05,
+      "loss": 0.6751,
+      "step": 1079
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.43526932412856223,
+      "learning_rate": 8.042741521933071e-05,
+      "loss": 0.6802,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5765333333333333,
+      "grad_norm": 0.47893887272111735,
+      "learning_rate": 8.025798192977481e-05,
+      "loss": 0.7243,
+      "step": 1081
+    },
+    {
+      "epoch": 0.5770666666666666,
+      "grad_norm": 0.4016805709957415,
+      "learning_rate": 8.008860759288147e-05,
+      "loss": 0.6123,
+      "step": 1082
+    },
+    {
+      "epoch": 0.5776,
+      "grad_norm": 0.3790105526008244,
+      "learning_rate": 7.991929271442817e-05,
+      "loss": 0.6645,
+      "step": 1083
+    },
+    {
+      "epoch": 0.5781333333333334,
+      "grad_norm": 0.4235556005744978,
+      "learning_rate": 7.975003780001485e-05,
+      "loss": 0.6937,
+      "step": 1084
+    },
+    {
+      "epoch": 0.5786666666666667,
+      "grad_norm": 0.3929143639351382,
+      "learning_rate": 7.958084335506239e-05,
+      "loss": 0.6306,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4817146393427374,
+      "learning_rate": 7.941170988481108e-05,
+      "loss": 0.8412,
+      "step": 1086
+    },
+    {
+      "epoch": 0.5797333333333333,
+      "grad_norm": 0.41991508462058535,
+      "learning_rate": 7.924263789431912e-05,
+      "loss": 0.6699,
+      "step": 1087
+    },
+    {
+      "epoch": 0.5802666666666667,
+      "grad_norm": 0.3680084098297583,
+      "learning_rate": 7.907362788846116e-05,
+      "loss": 0.6181,
+      "step": 1088
+    },
+    {
+      "epoch": 0.5808,
+      "grad_norm": 0.36386174429490653,
+      "learning_rate": 7.89046803719267e-05,
+      "loss": 0.619,
+      "step": 1089
+    },
+    {
+      "epoch": 0.5813333333333334,
+      "grad_norm": 0.4016558656284791,
+      "learning_rate": 7.873579584921869e-05,
+      "loss": 0.6795,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5818666666666666,
+      "grad_norm": 0.37460452727953286,
+      "learning_rate": 7.856697482465196e-05,
+      "loss": 0.6657,
+      "step": 1091
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.38628545562440836,
+      "learning_rate": 7.839821780235168e-05,
+      "loss": 0.6926,
+      "step": 1092
+    },
+    {
+      "epoch": 0.5829333333333333,
+      "grad_norm": 0.35647816512228697,
+      "learning_rate": 7.822952528625191e-05,
+      "loss": 0.6465,
+      "step": 1093
+    },
+    {
+      "epoch": 0.5834666666666667,
+      "grad_norm": 0.3980489861012932,
+      "learning_rate": 7.806089778009421e-05,
+      "loss": 0.6853,
+      "step": 1094
+    },
+    {
+      "epoch": 0.584,
+      "grad_norm": 0.38071801407450945,
+      "learning_rate": 7.789233578742582e-05,
+      "loss": 0.6694,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5845333333333333,
+      "grad_norm": 0.4008725472437602,
+      "learning_rate": 7.772383981159849e-05,
+      "loss": 0.7112,
+      "step": 1096
+    },
+    {
+      "epoch": 0.5850666666666666,
+      "grad_norm": 0.37430962526654193,
+      "learning_rate": 7.755541035576677e-05,
+      "loss": 0.6208,
+      "step": 1097
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.39821374623112504,
+      "learning_rate": 7.738704792288655e-05,
+      "loss": 0.6364,
+      "step": 1098
+    },
+    {
+      "epoch": 0.5861333333333333,
+      "grad_norm": 0.4234024743538159,
+      "learning_rate": 7.721875301571359e-05,
+      "loss": 0.7342,
+      "step": 1099
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.38635466588685263,
+      "learning_rate": 7.705052613680211e-05,
+      "loss": 0.6606,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5872,
+      "grad_norm": 0.4640882179165805,
+      "learning_rate": 7.688236778850306e-05,
+      "loss": 0.732,
+      "step": 1101
+    },
+    {
+      "epoch": 0.5877333333333333,
+      "grad_norm": 0.3587389095046982,
+      "learning_rate": 7.671427847296275e-05,
+      "loss": 0.6048,
+      "step": 1102
+    },
+    {
+      "epoch": 0.5882666666666667,
+      "grad_norm": 0.38280570307884154,
+      "learning_rate": 7.654625869212146e-05,
+      "loss": 0.6857,
+      "step": 1103
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.385190815200957,
+      "learning_rate": 7.637830894771175e-05,
+      "loss": 0.6507,
+      "step": 1104
+    },
+    {
+      "epoch": 0.5893333333333334,
+      "grad_norm": 0.39172766294738226,
+      "learning_rate": 7.6210429741257e-05,
+      "loss": 0.6592,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5898666666666667,
+      "grad_norm": 0.4343325420045918,
+      "learning_rate": 7.604262157407007e-05,
+      "loss": 0.6245,
+      "step": 1106
+    },
+    {
+      "epoch": 0.5904,
+      "grad_norm": 0.3795685669434547,
+      "learning_rate": 7.587488494725157e-05,
+      "loss": 0.6433,
+      "step": 1107
+    },
+    {
+      "epoch": 0.5909333333333333,
+      "grad_norm": 0.46458247504413547,
+      "learning_rate": 7.570722036168854e-05,
+      "loss": 0.7326,
+      "step": 1108
+    },
+    {
+      "epoch": 0.5914666666666667,
+      "grad_norm": 0.35247844174059123,
+      "learning_rate": 7.55396283180529e-05,
+      "loss": 0.5951,
+      "step": 1109
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.35314968631174554,
+      "learning_rate": 7.537210931679987e-05,
+      "loss": 0.604,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5925333333333334,
+      "grad_norm": 0.397353578219303,
+      "learning_rate": 7.520466385816671e-05,
+      "loss": 0.6967,
+      "step": 1111
+    },
+    {
+      "epoch": 0.5930666666666666,
+      "grad_norm": 0.5232148536867158,
+      "learning_rate": 7.503729244217086e-05,
+      "loss": 0.8047,
+      "step": 1112
+    },
+    {
+      "epoch": 0.5936,
+      "grad_norm": 0.35913464570859693,
+      "learning_rate": 7.48699955686089e-05,
+      "loss": 0.6145,
+      "step": 1113
+    },
+    {
+      "epoch": 0.5941333333333333,
+      "grad_norm": 0.38235743410072365,
+      "learning_rate": 7.470277373705461e-05,
+      "loss": 0.6406,
+      "step": 1114
+    },
+    {
+      "epoch": 0.5946666666666667,
+      "grad_norm": 0.41386567999451707,
+      "learning_rate": 7.453562744685778e-05,
+      "loss": 0.7543,
+      "step": 1115
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3751622819161369,
+      "learning_rate": 7.43685571971426e-05,
+      "loss": 0.705,
+      "step": 1116
+    },
+    {
+      "epoch": 0.5957333333333333,
+      "grad_norm": 0.43269323278603933,
+      "learning_rate": 7.42015634868062e-05,
+      "loss": 0.7224,
+      "step": 1117
+    },
+    {
+      "epoch": 0.5962666666666666,
+      "grad_norm": 0.38350270002476794,
+      "learning_rate": 7.403464681451715e-05,
+      "loss": 0.6654,
+      "step": 1118
+    },
+    {
+      "epoch": 0.5968,
+      "grad_norm": 0.39103904708593545,
+      "learning_rate": 7.386780767871397e-05,
+      "loss": 0.6514,
+      "step": 1119
+    },
+    {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.3750082436334888,
+      "learning_rate": 7.370104657760361e-05,
+      "loss": 0.6194,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5978666666666667,
+      "grad_norm": 0.44970376864398026,
+      "learning_rate": 7.353436400916004e-05,
+      "loss": 0.7271,
+      "step": 1121
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4957667572030496,
+      "learning_rate": 7.336776047112276e-05,
+      "loss": 0.7052,
+      "step": 1122
+    },
+    {
+      "epoch": 0.5989333333333333,
+      "grad_norm": 0.35200520111723355,
+      "learning_rate": 7.320123646099519e-05,
+      "loss": 0.581,
+      "step": 1123
+    },
+    {
+      "epoch": 0.5994666666666667,
+      "grad_norm": 0.3804936623120466,
+      "learning_rate": 7.303479247604332e-05,
+      "loss": 0.5876,
+      "step": 1124
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.41104529446282756,
+      "learning_rate": 7.286842901329412e-05,
+      "loss": 0.6924,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6005333333333334,
+      "grad_norm": 0.45170797450398764,
+      "learning_rate": 7.270214656953415e-05,
+      "loss": 0.653,
+      "step": 1126
+    },
+    {
+      "epoch": 0.6010666666666666,
+      "grad_norm": 0.38503923675228885,
+      "learning_rate": 7.253594564130804e-05,
+      "loss": 0.6022,
+      "step": 1127
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4633595168656192,
+      "learning_rate": 7.236982672491698e-05,
+      "loss": 0.7056,
+      "step": 1128
+    },
+    {
+      "epoch": 0.6021333333333333,
+      "grad_norm": 0.4661926119636266,
+      "learning_rate": 7.22037903164173e-05,
+      "loss": 0.7071,
+      "step": 1129
+    },
+    {
+      "epoch": 0.6026666666666667,
+      "grad_norm": 0.39058116202458776,
+      "learning_rate": 7.203783691161883e-05,
+      "loss": 0.6357,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6032,
+      "grad_norm": 0.4068710100692547,
+      "learning_rate": 7.187196700608373e-05,
+      "loss": 0.6284,
+      "step": 1131
+    },
+    {
+      "epoch": 0.6037333333333333,
+      "grad_norm": 0.36093967832027174,
+      "learning_rate": 7.170618109512465e-05,
+      "loss": 0.6628,
+      "step": 1132
+    },
+    {
+      "epoch": 0.6042666666666666,
+      "grad_norm": 0.3754030586808087,
+      "learning_rate": 7.154047967380354e-05,
+      "loss": 0.6962,
+      "step": 1133
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3601575344266386,
+      "learning_rate": 7.137486323692995e-05,
+      "loss": 0.6535,
+      "step": 1134
+    },
+    {
+      "epoch": 0.6053333333333333,
+      "grad_norm": 0.42508960170227306,
+      "learning_rate": 7.12093322790597e-05,
+      "loss": 0.6896,
+      "step": 1135
+    },
+    {
+      "epoch": 0.6058666666666667,
+      "grad_norm": 0.378594427641715,
+      "learning_rate": 7.104388729449338e-05,
+      "loss": 0.6055,
+      "step": 1136
+    },
+    {
+      "epoch": 0.6064,
+      "grad_norm": 0.3924478605875361,
+      "learning_rate": 7.087852877727481e-05,
+      "loss": 0.687,
+      "step": 1137
+    },
+    {
+      "epoch": 0.6069333333333333,
+      "grad_norm": 0.386931567989563,
+      "learning_rate": 7.071325722118963e-05,
+      "loss": 0.6785,
+      "step": 1138
+    },
+    {
+      "epoch": 0.6074666666666667,
+      "grad_norm": 0.40036626592390673,
+      "learning_rate": 7.054807311976379e-05,
+      "loss": 0.6768,
+      "step": 1139
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.45732859872622206,
+      "learning_rate": 7.038297696626206e-05,
+      "loss": 0.7172,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6085333333333334,
+      "grad_norm": 0.35181903750732224,
+      "learning_rate": 7.021796925368667e-05,
+      "loss": 0.6021,
+      "step": 1141
+    },
+    {
+      "epoch": 0.6090666666666666,
+      "grad_norm": 0.34220642484795927,
+      "learning_rate": 7.005305047477566e-05,
+      "loss": 0.6,
+      "step": 1142
+    },
+    {
+      "epoch": 0.6096,
+      "grad_norm": 0.37763765261431453,
+      "learning_rate": 6.988822112200156e-05,
+      "loss": 0.6134,
+      "step": 1143
+    },
+    {
+      "epoch": 0.6101333333333333,
+      "grad_norm": 0.42213695261788436,
+      "learning_rate": 6.972348168756983e-05,
+      "loss": 0.7144,
+      "step": 1144
+    },
+    {
+      "epoch": 0.6106666666666667,
+      "grad_norm": 0.3559131773085692,
+      "learning_rate": 6.955883266341741e-05,
+      "loss": 0.6105,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.5344280420368293,
+      "learning_rate": 6.939427454121128e-05,
+      "loss": 0.6698,
+      "step": 1146
+    },
+    {
+      "epoch": 0.6117333333333334,
+      "grad_norm": 0.4200921056379506,
+      "learning_rate": 6.922980781234699e-05,
+      "loss": 0.71,
+      "step": 1147
+    },
+    {
+      "epoch": 0.6122666666666666,
+      "grad_norm": 0.3696536574973899,
+      "learning_rate": 6.906543296794714e-05,
+      "loss": 0.6535,
+      "step": 1148
+    },
+    {
+      "epoch": 0.6128,
+      "grad_norm": 0.39741208691876595,
+      "learning_rate": 6.890115049885994e-05,
+      "loss": 0.706,
+      "step": 1149
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.38779623938426,
+      "learning_rate": 6.873696089565786e-05,
+      "loss": 0.6639,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6138666666666667,
+      "grad_norm": 0.3977903235022171,
+      "learning_rate": 6.85728646486359e-05,
+      "loss": 0.6886,
+      "step": 1151
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.39730445921698443,
+      "learning_rate": 6.84088622478104e-05,
+      "loss": 0.6592,
+      "step": 1152
+    },
+    {
+      "epoch": 0.6149333333333333,
+      "grad_norm": 0.403564736270205,
+      "learning_rate": 6.82449541829174e-05,
+      "loss": 0.7242,
+      "step": 1153
+    },
+    {
+      "epoch": 0.6154666666666667,
+      "grad_norm": 0.4110931452082343,
+      "learning_rate": 6.80811409434113e-05,
+      "loss": 0.6931,
+      "step": 1154
+    },
+    {
+      "epoch": 0.616,
+      "grad_norm": 0.3996311263746934,
+      "learning_rate": 6.791742301846326e-05,
+      "loss": 0.6377,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6165333333333334,
+      "grad_norm": 0.39032809008553926,
+      "learning_rate": 6.775380089695986e-05,
+      "loss": 0.7247,
+      "step": 1156
+    },
+    {
+      "epoch": 0.6170666666666667,
+      "grad_norm": 0.41106679109628125,
+      "learning_rate": 6.759027506750158e-05,
+      "loss": 0.7073,
+      "step": 1157
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.4236171414493968,
+      "learning_rate": 6.742684601840141e-05,
+      "loss": 0.7755,
+      "step": 1158
+    },
+    {
+      "epoch": 0.6181333333333333,
+      "grad_norm": 0.41627136156520544,
+      "learning_rate": 6.726351423768322e-05,
+      "loss": 0.6851,
+      "step": 1159
+    },
+    {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.37047575479258393,
+      "learning_rate": 6.710028021308061e-05,
+      "loss": 0.7144,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6192,
+      "grad_norm": 0.40065107124430077,
+      "learning_rate": 6.693714443203507e-05,
+      "loss": 0.6531,
+      "step": 1161
+    },
+    {
+      "epoch": 0.6197333333333334,
+      "grad_norm": 0.37739932655170266,
+      "learning_rate": 6.677410738169485e-05,
+      "loss": 0.6755,
+      "step": 1162
+    },
+    {
+      "epoch": 0.6202666666666666,
+      "grad_norm": 0.36182734161073676,
+      "learning_rate": 6.661116954891328e-05,
+      "loss": 0.6477,
+      "step": 1163
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.41099623638967697,
+      "learning_rate": 6.644833142024751e-05,
+      "loss": 0.634,
+      "step": 1164
+    },
+    {
+      "epoch": 0.6213333333333333,
+      "grad_norm": 0.401626466947384,
+      "learning_rate": 6.62855934819569e-05,
+      "loss": 0.691,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6218666666666667,
+      "grad_norm": 0.36959968335005,
+      "learning_rate": 6.612295622000162e-05,
+      "loss": 0.6895,
+      "step": 1166
+    },
+    {
+      "epoch": 0.6224,
+      "grad_norm": 0.4031099866728893,
+      "learning_rate": 6.59604201200412e-05,
+      "loss": 0.6504,
+      "step": 1167
+    },
+    {
+      "epoch": 0.6229333333333333,
+      "grad_norm": 0.3712238547650622,
+      "learning_rate": 6.579798566743314e-05,
+      "loss": 0.667,
+      "step": 1168
+    },
+    {
+      "epoch": 0.6234666666666666,
+      "grad_norm": 0.38921717995057287,
+      "learning_rate": 6.563565334723134e-05,
+      "loss": 0.6143,
+      "step": 1169
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.40796849123557527,
+      "learning_rate": 6.547342364418481e-05,
+      "loss": 0.6796,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6245333333333334,
+      "grad_norm": 0.3386381334707545,
+      "learning_rate": 6.531129704273604e-05,
+      "loss": 0.603,
+      "step": 1171
+    },
+    {
+      "epoch": 0.6250666666666667,
+      "grad_norm": 0.41218625274857573,
+      "learning_rate": 6.514927402701964e-05,
+      "loss": 0.6996,
+      "step": 1172
+    },
+    {
+      "epoch": 0.6256,
+      "grad_norm": 0.41575269185267494,
+      "learning_rate": 6.498735508086093e-05,
+      "loss": 0.6846,
+      "step": 1173
+    },
+    {
+      "epoch": 0.6261333333333333,
+      "grad_norm": 0.370757705379407,
+      "learning_rate": 6.48255406877745e-05,
+      "loss": 0.6253,
+      "step": 1174
+    },
+    {
+      "epoch": 0.6266666666666667,
+      "grad_norm": 0.3498429362561111,
+      "learning_rate": 6.466383133096267e-05,
+      "loss": 0.582,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3967937924943134,
+      "learning_rate": 6.450222749331414e-05,
+      "loss": 0.6348,
+      "step": 1176
+    },
+    {
+      "epoch": 0.6277333333333334,
+      "grad_norm": 0.46531867946137895,
+      "learning_rate": 6.434072965740242e-05,
+      "loss": 0.7073,
+      "step": 1177
+    },
+    {
+      "epoch": 0.6282666666666666,
+      "grad_norm": 0.37955040353422276,
+      "learning_rate": 6.417933830548467e-05,
+      "loss": 0.675,
+      "step": 1178
+    },
+    {
+      "epoch": 0.6288,
+      "grad_norm": 0.3543836601917388,
+      "learning_rate": 6.40180539194999e-05,
+      "loss": 0.6189,
+      "step": 1179
+    },
+    {
+      "epoch": 0.6293333333333333,
+      "grad_norm": 0.40689125569370455,
+      "learning_rate": 6.385687698106781e-05,
+      "loss": 0.6555,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6298666666666667,
+      "grad_norm": 0.4163295050376555,
+      "learning_rate": 6.369580797148718e-05,
+      "loss": 0.6658,
+      "step": 1181
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4969283756813846,
+      "learning_rate": 6.35348473717345e-05,
+      "loss": 0.675,
+      "step": 1182
+    },
+    {
+      "epoch": 0.6309333333333333,
+      "grad_norm": 0.44524254611823577,
+      "learning_rate": 6.337399566246257e-05,
+      "loss": 0.7528,
+      "step": 1183
+    },
+    {
+      "epoch": 0.6314666666666666,
+      "grad_norm": 0.37316751405405507,
+      "learning_rate": 6.321325332399903e-05,
+      "loss": 0.6495,
+      "step": 1184
+    },
+    {
+      "epoch": 0.632,
+      "grad_norm": 0.36577387173292003,
+      "learning_rate": 6.305262083634488e-05,
+      "loss": 0.6217,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6325333333333333,
+      "grad_norm": 0.441330122528069,
+      "learning_rate": 6.289209867917312e-05,
+      "loss": 0.6822,
+      "step": 1186
+    },
+    {
+      "epoch": 0.6330666666666667,
+      "grad_norm": 0.4303781965223265,
+      "learning_rate": 6.273168733182722e-05,
+      "loss": 0.6586,
+      "step": 1187
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.4104655155426858,
+      "learning_rate": 6.25713872733199e-05,
+      "loss": 0.6503,
+      "step": 1188
+    },
+    {
+      "epoch": 0.6341333333333333,
+      "grad_norm": 0.3990485931055677,
+      "learning_rate": 6.241119898233144e-05,
+      "loss": 0.6871,
+      "step": 1189
+    },
+    {
+      "epoch": 0.6346666666666667,
+      "grad_norm": 0.3887220063485232,
+      "learning_rate": 6.225112293720836e-05,
+      "loss": 0.6743,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6352,
+      "grad_norm": 0.3828225414927008,
+      "learning_rate": 6.209115961596208e-05,
+      "loss": 0.6497,
+      "step": 1191
+    },
+    {
+      "epoch": 0.6357333333333334,
+      "grad_norm": 0.39661169755538084,
+      "learning_rate": 6.19313094962673e-05,
+      "loss": 0.6359,
+      "step": 1192
+    },
+    {
+      "epoch": 0.6362666666666666,
+      "grad_norm": 0.43297412058206247,
+      "learning_rate": 6.177157305546078e-05,
+      "loss": 0.6677,
+      "step": 1193
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4346435679710135,
+      "learning_rate": 6.161195077053976e-05,
+      "loss": 0.7041,
+      "step": 1194
+    },
+    {
+      "epoch": 0.6373333333333333,
+      "grad_norm": 0.3787375488050026,
+      "learning_rate": 6.145244311816063e-05,
+      "loss": 0.6975,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6378666666666667,
+      "grad_norm": 0.39924033430802885,
+      "learning_rate": 6.129305057463741e-05,
+      "loss": 0.6915,
+      "step": 1196
+    },
+    {
+      "epoch": 0.6384,
+      "grad_norm": 0.35882244776870287,
+      "learning_rate": 6.113377361594049e-05,
+      "loss": 0.6333,
+      "step": 1197
+    },
+    {
+      "epoch": 0.6389333333333334,
+      "grad_norm": 0.3892922219688357,
+      "learning_rate": 6.0974612717695004e-05,
+      "loss": 0.6575,
+      "step": 1198
+    },
+    {
+      "epoch": 0.6394666666666666,
+      "grad_norm": 0.4025711918252862,
+      "learning_rate": 6.0815568355179556e-05,
+      "loss": 0.6607,
+      "step": 1199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.39686322723674156,
+      "learning_rate": 6.065664100332478e-05,
+      "loss": 0.6663,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6405333333333333,
+      "grad_norm": 0.38696545733015086,
+      "learning_rate": 6.0497831136711836e-05,
+      "loss": 0.6851,
+      "step": 1201
+    },
+    {
+      "epoch": 0.6410666666666667,
+      "grad_norm": 0.4101242906783707,
+      "learning_rate": 6.0339139229571116e-05,
+      "loss": 0.7008,
+      "step": 1202
+    },
+    {
+      "epoch": 0.6416,
+      "grad_norm": 0.4016275859970521,
+      "learning_rate": 6.018056575578075e-05,
+      "loss": 0.626,
+      "step": 1203
+    },
+    {
+      "epoch": 0.6421333333333333,
+      "grad_norm": 0.4545316434527366,
+      "learning_rate": 6.002211118886514e-05,
+      "loss": 0.6483,
+      "step": 1204
+    },
+    {
+      "epoch": 0.6426666666666667,
+      "grad_norm": 0.38345074027203224,
+      "learning_rate": 5.986377600199371e-05,
+      "loss": 0.6432,
+      "step": 1205
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4479961188016446,
+      "learning_rate": 5.970556066797941e-05,
+      "loss": 0.7402,
+      "step": 1206
+    },
+    {
+      "epoch": 0.6437333333333334,
+      "grad_norm": 0.4459207817163863,
+      "learning_rate": 5.9547465659277215e-05,
+      "loss": 0.7431,
+      "step": 1207
+    },
+    {
+      "epoch": 0.6442666666666667,
+      "grad_norm": 0.4319954107698843,
+      "learning_rate": 5.938949144798279e-05,
+      "loss": 0.722,
+      "step": 1208
+    },
+    {
+      "epoch": 0.6448,
+      "grad_norm": 0.38149712675860425,
+      "learning_rate": 5.923163850583113e-05,
+      "loss": 0.6703,
+      "step": 1209
+    },
+    {
+      "epoch": 0.6453333333333333,
+      "grad_norm": 0.37791425530410894,
+      "learning_rate": 5.907390730419507e-05,
+      "loss": 0.6656,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6458666666666667,
+      "grad_norm": 0.4121481150338273,
+      "learning_rate": 5.8916298314083915e-05,
+      "loss": 0.651,
+      "step": 1211
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4169291524422402,
+      "learning_rate": 5.875881200614207e-05,
+      "loss": 0.7033,
+      "step": 1212
+    },
+    {
+      "epoch": 0.6469333333333334,
+      "grad_norm": 0.42033859419007247,
+      "learning_rate": 5.860144885064751e-05,
+      "loss": 0.6443,
+      "step": 1213
+    },
+    {
+      "epoch": 0.6474666666666666,
+      "grad_norm": 0.3700272313497562,
+      "learning_rate": 5.8444209317510514e-05,
+      "loss": 0.6601,
+      "step": 1214
+    },
+    {
+      "epoch": 0.648,
+      "grad_norm": 0.4778802988426458,
+      "learning_rate": 5.828709387627218e-05,
+      "loss": 0.697,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6485333333333333,
+      "grad_norm": 0.35768936042185684,
+      "learning_rate": 5.813010299610313e-05,
+      "loss": 0.6168,
+      "step": 1216
+    },
+    {
+      "epoch": 0.6490666666666667,
+      "grad_norm": 0.36800043178590347,
+      "learning_rate": 5.797323714580192e-05,
+      "loss": 0.6457,
+      "step": 1217
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.40303810625934766,
+      "learning_rate": 5.781649679379378e-05,
+      "loss": 0.7043,
+      "step": 1218
+    },
+    {
+      "epoch": 0.6501333333333333,
+      "grad_norm": 0.43424053966722104,
+      "learning_rate": 5.765988240812921e-05,
+      "loss": 0.7209,
+      "step": 1219
+    },
+    {
+      "epoch": 0.6506666666666666,
+      "grad_norm": 0.3793422373656162,
+      "learning_rate": 5.750339445648252e-05,
+      "loss": 0.67,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6512,
+      "grad_norm": 0.39844209797716984,
+      "learning_rate": 5.73470334061505e-05,
+      "loss": 0.7021,
+      "step": 1221
+    },
+    {
+      "epoch": 0.6517333333333334,
+      "grad_norm": 0.3722643921892602,
+      "learning_rate": 5.7190799724050924e-05,
+      "loss": 0.6377,
+      "step": 1222
+    },
+    {
+      "epoch": 0.6522666666666667,
+      "grad_norm": 0.4133785282213768,
+      "learning_rate": 5.7034693876721376e-05,
+      "loss": 0.6525,
+      "step": 1223
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5129186669636846,
+      "learning_rate": 5.687871633031754e-05,
+      "loss": 0.726,
+      "step": 1224
+    },
+    {
+      "epoch": 0.6533333333333333,
+      "grad_norm": 0.456742676325474,
+      "learning_rate": 5.6722867550612116e-05,
+      "loss": 0.7273,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6538666666666667,
+      "grad_norm": 0.3600078285545767,
+      "learning_rate": 5.6567148002993164e-05,
+      "loss": 0.6206,
+      "step": 1226
+    },
+    {
+      "epoch": 0.6544,
+      "grad_norm": 0.43111287342009397,
+      "learning_rate": 5.6411558152462894e-05,
+      "loss": 0.7103,
+      "step": 1227
+    },
+    {
+      "epoch": 0.6549333333333334,
+      "grad_norm": 0.3626978793288534,
+      "learning_rate": 5.625609846363622e-05,
+      "loss": 0.6129,
+      "step": 1228
+    },
+    {
+      "epoch": 0.6554666666666666,
+      "grad_norm": 0.4343160084912167,
+      "learning_rate": 5.6100769400739383e-05,
+      "loss": 0.7009,
+      "step": 1229
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4555172930959856,
+      "learning_rate": 5.5945571427608526e-05,
+      "loss": 0.7079,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6565333333333333,
+      "grad_norm": 0.40227511524808995,
+      "learning_rate": 5.579050500768836e-05,
+      "loss": 0.6994,
+      "step": 1231
+    },
+    {
+      "epoch": 0.6570666666666667,
+      "grad_norm": 0.48632666387981066,
+      "learning_rate": 5.5635570604030705e-05,
+      "loss": 0.696,
+      "step": 1232
+    },
+    {
+      "epoch": 0.6576,
+      "grad_norm": 0.42581739779144323,
+      "learning_rate": 5.54807686792933e-05,
+      "loss": 0.6743,
+      "step": 1233
+    },
+    {
+      "epoch": 0.6581333333333333,
+      "grad_norm": 0.3825639992276115,
+      "learning_rate": 5.53260996957381e-05,
+      "loss": 0.6205,
+      "step": 1234
+    },
+    {
+      "epoch": 0.6586666666666666,
+      "grad_norm": 0.3793282181515819,
+      "learning_rate": 5.5171564115230254e-05,
+      "loss": 0.6942,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.571196272629039,
+      "learning_rate": 5.501716239923642e-05,
+      "loss": 0.7054,
+      "step": 1236
+    },
+    {
+      "epoch": 0.6597333333333333,
+      "grad_norm": 0.3593515575564961,
+      "learning_rate": 5.486289500882355e-05,
+      "loss": 0.6245,
+      "step": 1237
+    },
+    {
+      "epoch": 0.6602666666666667,
+      "grad_norm": 0.3768362851791814,
+      "learning_rate": 5.47087624046575e-05,
+      "loss": 0.6837,
+      "step": 1238
+    },
+    {
+      "epoch": 0.6608,
+      "grad_norm": 0.4416232089824355,
+      "learning_rate": 5.4554765047001613e-05,
+      "loss": 0.6785,
+      "step": 1239
+    },
+    {
+      "epoch": 0.6613333333333333,
+      "grad_norm": 0.37772676319392723,
+      "learning_rate": 5.4400903395715366e-05,
+      "loss": 0.6278,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6618666666666667,
+      "grad_norm": 0.43467731327877024,
+      "learning_rate": 5.424717791025302e-05,
+      "loss": 0.7057,
+      "step": 1241
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.3904238371216619,
+      "learning_rate": 5.4093589049662175e-05,
+      "loss": 0.6817,
+      "step": 1242
+    },
+    {
+      "epoch": 0.6629333333333334,
+      "grad_norm": 0.3882791634332464,
+      "learning_rate": 5.394013727258254e-05,
+      "loss": 0.6684,
+      "step": 1243
+    },
+    {
+      "epoch": 0.6634666666666666,
+      "grad_norm": 0.43935029585606616,
+      "learning_rate": 5.378682303724435e-05,
+      "loss": 0.6546,
+      "step": 1244
+    },
+    {
+      "epoch": 0.664,
+      "grad_norm": 0.40037159051848714,
+      "learning_rate": 5.363364680146725e-05,
+      "loss": 0.6343,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6645333333333333,
+      "grad_norm": 0.4557479411637336,
+      "learning_rate": 5.348060902265871e-05,
+      "loss": 0.7471,
+      "step": 1246
+    },
+    {
+      "epoch": 0.6650666666666667,
+      "grad_norm": 0.38531054077996774,
+      "learning_rate": 5.332771015781275e-05,
+      "loss": 0.6582,
+      "step": 1247
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.38429972181306354,
+      "learning_rate": 5.31749506635086e-05,
+      "loss": 0.698,
+      "step": 1248
+    },
+    {
+      "epoch": 0.6661333333333334,
+      "grad_norm": 0.43581105469606396,
+      "learning_rate": 5.302233099590928e-05,
+      "loss": 0.7503,
+      "step": 1249
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.3786961249187762,
+      "learning_rate": 5.286985161076029e-05,
+      "loss": 0.6616,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6672,
+      "grad_norm": 0.36814280572850133,
+      "learning_rate": 5.271751296338823e-05,
+      "loss": 0.6111,
+      "step": 1251
+    },
+    {
+      "epoch": 0.6677333333333333,
+      "grad_norm": 0.35289894291087465,
+      "learning_rate": 5.2565315508699376e-05,
+      "loss": 0.6721,
+      "step": 1252
+    },
+    {
+      "epoch": 0.6682666666666667,
+      "grad_norm": 0.3788897774951766,
+      "learning_rate": 5.2413259701178505e-05,
+      "loss": 0.6638,
+      "step": 1253
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3843078947868329,
+      "learning_rate": 5.226134599488728e-05,
+      "loss": 0.7047,
+      "step": 1254
+    },
+    {
+      "epoch": 0.6693333333333333,
+      "grad_norm": 0.3956161994374667,
+      "learning_rate": 5.210957484346314e-05,
+      "loss": 0.6583,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6698666666666667,
+      "grad_norm": 0.4324262603703905,
+      "learning_rate": 5.195794670011776e-05,
+      "loss": 0.6834,
+      "step": 1256
+    },
+    {
+      "epoch": 0.6704,
+      "grad_norm": 0.4286966243205002,
+      "learning_rate": 5.180646201763577e-05,
+      "loss": 0.696,
+      "step": 1257
+    },
+    {
+      "epoch": 0.6709333333333334,
+      "grad_norm": 0.3630002637808542,
+      "learning_rate": 5.165512124837344e-05,
+      "loss": 0.6502,
+      "step": 1258
+    },
+    {
+      "epoch": 0.6714666666666667,
+      "grad_norm": 0.38035823271916014,
+      "learning_rate": 5.150392484425728e-05,
+      "loss": 0.5918,
+      "step": 1259
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.4576968077355702,
+      "learning_rate": 5.135287325678271e-05,
+      "loss": 0.686,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6725333333333333,
+      "grad_norm": 0.36880792848008637,
+      "learning_rate": 5.120196693701267e-05,
+      "loss": 0.6668,
+      "step": 1261
+    },
+    {
+      "epoch": 0.6730666666666667,
+      "grad_norm": 0.3679974050882043,
+      "learning_rate": 5.105120633557634e-05,
+      "loss": 0.6546,
+      "step": 1262
+    },
+    {
+      "epoch": 0.6736,
+      "grad_norm": 0.4131694246907738,
+      "learning_rate": 5.090059190266779e-05,
+      "loss": 0.6652,
+      "step": 1263
+    },
+    {
+      "epoch": 0.6741333333333334,
+      "grad_norm": 0.4493186583002729,
+      "learning_rate": 5.075012408804458e-05,
+      "loss": 0.7106,
+      "step": 1264
+    },
+    {
+      "epoch": 0.6746666666666666,
+      "grad_norm": 0.4208962131856972,
+      "learning_rate": 5.059980334102637e-05,
+      "loss": 0.6626,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3585481968909659,
+      "learning_rate": 5.0449630110493836e-05,
+      "loss": 0.6645,
+      "step": 1266
+    },
+    {
+      "epoch": 0.6757333333333333,
+      "grad_norm": 0.35082005531937965,
+      "learning_rate": 5.0299604844886985e-05,
+      "loss": 0.6022,
+      "step": 1267
+    },
+    {
+      "epoch": 0.6762666666666667,
+      "grad_norm": 0.38486451673856137,
+      "learning_rate": 5.014972799220403e-05,
+      "loss": 0.6073,
+      "step": 1268
+    },
+    {
+      "epoch": 0.6768,
+      "grad_norm": 0.41165383955462015,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.6918,
+      "step": 1269
+    },
+    {
+      "epoch": 0.6773333333333333,
+      "grad_norm": 0.3712772782434978,
+      "learning_rate": 4.985042131538545e-05,
+      "loss": 0.5546,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6778666666666666,
+      "grad_norm": 0.39827766286989835,
+      "learning_rate": 4.9700992385024934e-05,
+      "loss": 0.6296,
+      "step": 1271
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.44595990018200404,
+      "learning_rate": 4.955171365513603e-05,
+      "loss": 0.6926,
+      "step": 1272
+    },
+    {
+      "epoch": 0.6789333333333334,
+      "grad_norm": 0.4230801733220779,
+      "learning_rate": 4.940258557148765e-05,
+      "loss": 0.642,
+      "step": 1273
+    },
+    {
+      "epoch": 0.6794666666666667,
+      "grad_norm": 0.536142732112756,
+      "learning_rate": 4.9253608579398855e-05,
+      "loss": 0.7798,
+      "step": 1274
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.36332509710940986,
+      "learning_rate": 4.9104783123737566e-05,
+      "loss": 0.6313,
+      "step": 1275
+    },
+    {
+      "epoch": 0.6805333333333333,
+      "grad_norm": 0.40031407367425914,
+      "learning_rate": 4.895610964891923e-05,
+      "loss": 0.6427,
+      "step": 1276
+    },
+    {
+      "epoch": 0.6810666666666667,
+      "grad_norm": 0.40293063394952483,
+      "learning_rate": 4.880758859890536e-05,
+      "loss": 0.6547,
+      "step": 1277
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.39869455791686215,
+      "learning_rate": 4.865922041720239e-05,
+      "loss": 0.6398,
+      "step": 1278
+    },
+    {
+      "epoch": 0.6821333333333334,
+      "grad_norm": 0.48883387350145335,
+      "learning_rate": 4.851100554686021e-05,
+      "loss": 0.7496,
+      "step": 1279
+    },
+    {
+      "epoch": 0.6826666666666666,
+      "grad_norm": 0.3926469433108249,
+      "learning_rate": 4.836294443047088e-05,
+      "loss": 0.6801,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6832,
+      "grad_norm": 0.4677723780348946,
+      "learning_rate": 4.821503751016746e-05,
+      "loss": 0.7409,
+      "step": 1281
+    },
+    {
+      "epoch": 0.6837333333333333,
+      "grad_norm": 0.43363420970950584,
+      "learning_rate": 4.8067285227622404e-05,
+      "loss": 0.649,
+      "step": 1282
+    },
+    {
+      "epoch": 0.6842666666666667,
+      "grad_norm": 0.4183026247555478,
+      "learning_rate": 4.791968802404648e-05,
+      "loss": 0.6925,
+      "step": 1283
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4405274776531298,
+      "learning_rate": 4.777224634018732e-05,
+      "loss": 0.6791,
+      "step": 1284
+    },
+    {
+      "epoch": 0.6853333333333333,
+      "grad_norm": 0.3934240974563691,
+      "learning_rate": 4.762496061632814e-05,
+      "loss": 0.6386,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6858666666666666,
+      "grad_norm": 0.4054706785769142,
+      "learning_rate": 4.747783129228656e-05,
+      "loss": 0.6828,
+      "step": 1286
+    },
+    {
+      "epoch": 0.6864,
+      "grad_norm": 0.37853453149151145,
+      "learning_rate": 4.733085880741301e-05,
+      "loss": 0.6625,
+      "step": 1287
+    },
+    {
+      "epoch": 0.6869333333333333,
+      "grad_norm": 0.37513832966085037,
+      "learning_rate": 4.718404360058966e-05,
+      "loss": 0.6342,
+      "step": 1288
+    },
+    {
+      "epoch": 0.6874666666666667,
+      "grad_norm": 0.4023934982278683,
+      "learning_rate": 4.7037386110228985e-05,
+      "loss": 0.7325,
+      "step": 1289
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.36899840695902886,
+      "learning_rate": 4.689088677427249e-05,
+      "loss": 0.6281,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6885333333333333,
+      "grad_norm": 0.4033113443179532,
+      "learning_rate": 4.6744546030189486e-05,
+      "loss": 0.7099,
+      "step": 1291
+    },
+    {
+      "epoch": 0.6890666666666667,
+      "grad_norm": 0.3171488557975274,
+      "learning_rate": 4.659836431497563e-05,
+      "loss": 0.5754,
+      "step": 1292
+    },
+    {
+      "epoch": 0.6896,
+      "grad_norm": 0.41542117458614175,
+      "learning_rate": 4.645234206515171e-05,
+      "loss": 0.6905,
+      "step": 1293
+    },
+    {
+      "epoch": 0.6901333333333334,
+      "grad_norm": 0.39437793112096853,
+      "learning_rate": 4.630647971676232e-05,
+      "loss": 0.6204,
+      "step": 1294
+    },
+    {
+      "epoch": 0.6906666666666667,
+      "grad_norm": 0.3553818449896893,
+      "learning_rate": 4.6160777705374524e-05,
+      "loss": 0.6217,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.40995002407888353,
+      "learning_rate": 4.6015236466076747e-05,
+      "loss": 0.665,
+      "step": 1296
+    },
+    {
+      "epoch": 0.6917333333333333,
+      "grad_norm": 0.36100741046680696,
+      "learning_rate": 4.586985643347717e-05,
+      "loss": 0.616,
+      "step": 1297
+    },
+    {
+      "epoch": 0.6922666666666667,
+      "grad_norm": 0.37818239833090767,
+      "learning_rate": 4.572463804170263e-05,
+      "loss": 0.6271,
+      "step": 1298
+    },
+    {
+      "epoch": 0.6928,
+      "grad_norm": 0.3776610659012665,
+      "learning_rate": 4.5579581724397255e-05,
+      "loss": 0.6637,
+      "step": 1299
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.4231154654286901,
+      "learning_rate": 4.543468791472131e-05,
+      "loss": 0.6329,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6938666666666666,
+      "grad_norm": 0.42936741925917193,
+      "learning_rate": 4.5289957045349653e-05,
+      "loss": 0.6581,
+      "step": 1301
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3923600873659972,
+      "learning_rate": 4.514538954847064e-05,
+      "loss": 0.65,
+      "step": 1302
+    },
+    {
+      "epoch": 0.6949333333333333,
+      "grad_norm": 0.3935245359603157,
+      "learning_rate": 4.5000985855784746e-05,
+      "loss": 0.6698,
+      "step": 1303
+    },
+    {
+      "epoch": 0.6954666666666667,
+      "grad_norm": 0.3672090508252383,
+      "learning_rate": 4.485674639850333e-05,
+      "loss": 0.6356,
+      "step": 1304
+    },
+    {
+      "epoch": 0.696,
+      "grad_norm": 0.38870391355750356,
+      "learning_rate": 4.471267160734731e-05,
+      "loss": 0.659,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6965333333333333,
+      "grad_norm": 0.4307327798360442,
+      "learning_rate": 4.456876191254582e-05,
+      "loss": 0.7335,
+      "step": 1306
+    },
+    {
+      "epoch": 0.6970666666666666,
+      "grad_norm": 0.44166475115987996,
+      "learning_rate": 4.442501774383515e-05,
+      "loss": 0.6762,
+      "step": 1307
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.36626683468551186,
+      "learning_rate": 4.428143953045717e-05,
+      "loss": 0.6232,
+      "step": 1308
+    },
+    {
+      "epoch": 0.6981333333333334,
+      "grad_norm": 0.40103874478481943,
+      "learning_rate": 4.413802770115816e-05,
+      "loss": 0.6844,
+      "step": 1309
+    },
+    {
+      "epoch": 0.6986666666666667,
+      "grad_norm": 0.3949961307614567,
+      "learning_rate": 4.399478268418771e-05,
+      "loss": 0.6512,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6992,
+      "grad_norm": 0.4351564575723422,
+      "learning_rate": 4.385170490729712e-05,
+      "loss": 0.7587,
+      "step": 1311
+    },
+    {
+      "epoch": 0.6997333333333333,
+      "grad_norm": 0.38572628275672344,
+      "learning_rate": 4.3708794797738375e-05,
+      "loss": 0.6679,
+      "step": 1312
+    },
+    {
+      "epoch": 0.7002666666666667,
+      "grad_norm": 0.36538503676275325,
+      "learning_rate": 4.3566052782262735e-05,
+      "loss": 0.6095,
+      "step": 1313
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.38274039879275157,
+      "learning_rate": 4.342347928711953e-05,
+      "loss": 0.6448,
+      "step": 1314
+    },
+    {
+      "epoch": 0.7013333333333334,
+      "grad_norm": 0.44275866152143434,
+      "learning_rate": 4.328107473805487e-05,
+      "loss": 0.6761,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7018666666666666,
+      "grad_norm": 0.37872154665111507,
+      "learning_rate": 4.3138839560310303e-05,
+      "loss": 0.6458,
+      "step": 1316
+    },
+    {
+      "epoch": 0.7024,
+      "grad_norm": 0.41637145712821355,
+      "learning_rate": 4.2996774178621736e-05,
+      "loss": 0.6847,
+      "step": 1317
+    },
+    {
+      "epoch": 0.7029333333333333,
+      "grad_norm": 0.3731937878897712,
+      "learning_rate": 4.2854879017217894e-05,
+      "loss": 0.6247,
+      "step": 1318
+    },
+    {
+      "epoch": 0.7034666666666667,
+      "grad_norm": 0.4131083805194796,
+      "learning_rate": 4.271315449981934e-05,
+      "loss": 0.6178,
+      "step": 1319
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.41315094556072635,
+      "learning_rate": 4.257160104963696e-05,
+      "loss": 0.7103,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7045333333333333,
+      "grad_norm": 0.3946508996007053,
+      "learning_rate": 4.2430219089370823e-05,
+      "loss": 0.6068,
+      "step": 1321
+    },
+    {
+      "epoch": 0.7050666666666666,
+      "grad_norm": 0.4133597282234054,
+      "learning_rate": 4.228900904120895e-05,
+      "loss": 0.6382,
+      "step": 1322
+    },
+    {
+      "epoch": 0.7056,
+      "grad_norm": 0.36438538202343185,
+      "learning_rate": 4.2147971326825966e-05,
+      "loss": 0.6008,
+      "step": 1323
+    },
+    {
+      "epoch": 0.7061333333333333,
+      "grad_norm": 0.35554634006170616,
+      "learning_rate": 4.200710636738189e-05,
+      "loss": 0.6025,
+      "step": 1324
+    },
+    {
+      "epoch": 0.7066666666666667,
+      "grad_norm": 0.39559354992693413,
+      "learning_rate": 4.1866414583520877e-05,
+      "loss": 0.6592,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.40637796318098757,
+      "learning_rate": 4.172589639536991e-05,
+      "loss": 0.6341,
+      "step": 1326
+    },
+    {
+      "epoch": 0.7077333333333333,
+      "grad_norm": 0.3844034545296773,
+      "learning_rate": 4.158555222253771e-05,
+      "loss": 0.6626,
+      "step": 1327
+    },
+    {
+      "epoch": 0.7082666666666667,
+      "grad_norm": 0.3649619850541277,
+      "learning_rate": 4.14453824841132e-05,
+      "loss": 0.5766,
+      "step": 1328
+    },
+    {
+      "epoch": 0.7088,
+      "grad_norm": 0.499463231063169,
+      "learning_rate": 4.130538759866457e-05,
+      "loss": 0.7875,
+      "step": 1329
+    },
+    {
+      "epoch": 0.7093333333333334,
+      "grad_norm": 0.3661031272438177,
+      "learning_rate": 4.1165567984237764e-05,
+      "loss": 0.6066,
+      "step": 1330
+    },
+    {
+      "epoch": 0.7098666666666666,
+      "grad_norm": 0.36741709928338706,
+      "learning_rate": 4.102592405835536e-05,
+      "loss": 0.6208,
+      "step": 1331
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4284659517016139,
+      "learning_rate": 4.088645623801534e-05,
+      "loss": 0.7017,
+      "step": 1332
+    },
+    {
+      "epoch": 0.7109333333333333,
+      "grad_norm": 0.4161922080822117,
+      "learning_rate": 4.074716493968975e-05,
+      "loss": 0.7325,
+      "step": 1333
+    },
+    {
+      "epoch": 0.7114666666666667,
+      "grad_norm": 0.38075809983273384,
+      "learning_rate": 4.060805057932359e-05,
+      "loss": 0.6238,
+      "step": 1334
+    },
+    {
+      "epoch": 0.712,
+      "grad_norm": 0.39114450256771044,
+      "learning_rate": 4.046911357233343e-05,
+      "loss": 0.6198,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7125333333333334,
+      "grad_norm": 0.45547674998004817,
+      "learning_rate": 4.0330354333606234e-05,
+      "loss": 0.6436,
+      "step": 1336
+    },
+    {
+      "epoch": 0.7130666666666666,
+      "grad_norm": 0.4032049645290277,
+      "learning_rate": 4.019177327749822e-05,
+      "loss": 0.6714,
+      "step": 1337
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.37367310708375145,
+      "learning_rate": 4.00533708178334e-05,
+      "loss": 0.6155,
+      "step": 1338
+    },
+    {
+      "epoch": 0.7141333333333333,
+      "grad_norm": 0.3742593122537589,
+      "learning_rate": 3.991514736790258e-05,
+      "loss": 0.6209,
+      "step": 1339
+    },
+    {
+      "epoch": 0.7146666666666667,
+      "grad_norm": 0.3949703021201749,
+      "learning_rate": 3.977710334046193e-05,
+      "loss": 0.6261,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7152,
+      "grad_norm": 0.4670628457168731,
+      "learning_rate": 3.963923914773187e-05,
+      "loss": 0.6323,
+      "step": 1341
+    },
+    {
+      "epoch": 0.7157333333333333,
+      "grad_norm": 0.472725466215836,
+      "learning_rate": 3.950155520139581e-05,
+      "loss": 0.7601,
+      "step": 1342
+    },
+    {
+      "epoch": 0.7162666666666667,
+      "grad_norm": 0.4123296577584803,
+      "learning_rate": 3.936405191259891e-05,
+      "loss": 0.6882,
+      "step": 1343
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3902995937508573,
+      "learning_rate": 3.922672969194686e-05,
+      "loss": 0.6124,
+      "step": 1344
+    },
+    {
+      "epoch": 0.7173333333333334,
+      "grad_norm": 0.4432532967320152,
+      "learning_rate": 3.9089588949504655e-05,
+      "loss": 0.7045,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7178666666666667,
+      "grad_norm": 0.32557319256599393,
+      "learning_rate": 3.895263009479534e-05,
+      "loss": 0.6022,
+      "step": 1346
+    },
+    {
+      "epoch": 0.7184,
+      "grad_norm": 0.44612344737641824,
+      "learning_rate": 3.8815853536798904e-05,
+      "loss": 0.7007,
+      "step": 1347
+    },
+    {
+      "epoch": 0.7189333333333333,
+      "grad_norm": 0.400274471628821,
+      "learning_rate": 3.867925968395085e-05,
+      "loss": 0.624,
+      "step": 1348
+    },
+    {
+      "epoch": 0.7194666666666667,
+      "grad_norm": 0.4347762917189781,
+      "learning_rate": 3.854284894414122e-05,
+      "loss": 0.659,
+      "step": 1349
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3528563922408139,
+      "learning_rate": 3.840662172471315e-05,
+      "loss": 0.5884,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7205333333333334,
+      "grad_norm": 0.41839505115370795,
+      "learning_rate": 3.82705784324618e-05,
+      "loss": 0.6532,
+      "step": 1351
+    },
+    {
+      "epoch": 0.7210666666666666,
+      "grad_norm": 0.38938423552045953,
+      "learning_rate": 3.8134719473633094e-05,
+      "loss": 0.682,
+      "step": 1352
+    },
+    {
+      "epoch": 0.7216,
+      "grad_norm": 0.3649377142914851,
+      "learning_rate": 3.79990452539225e-05,
+      "loss": 0.6228,
+      "step": 1353
+    },
+    {
+      "epoch": 0.7221333333333333,
+      "grad_norm": 0.44484447899883567,
+      "learning_rate": 3.786355617847385e-05,
+      "loss": 0.6934,
+      "step": 1354
+    },
+    {
+      "epoch": 0.7226666666666667,
+      "grad_norm": 0.4348795155737933,
+      "learning_rate": 3.772825265187802e-05,
+      "loss": 0.6532,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.4035943809385256,
+      "learning_rate": 3.759313507817196e-05,
+      "loss": 0.6405,
+      "step": 1356
+    },
+    {
+      "epoch": 0.7237333333333333,
+      "grad_norm": 0.41145186927193345,
+      "learning_rate": 3.7458203860837234e-05,
+      "loss": 0.6327,
+      "step": 1357
+    },
+    {
+      "epoch": 0.7242666666666666,
+      "grad_norm": 0.35660941883562153,
+      "learning_rate": 3.732345940279893e-05,
+      "loss": 0.5593,
+      "step": 1358
+    },
+    {
+      "epoch": 0.7248,
+      "grad_norm": 0.348782187361037,
+      "learning_rate": 3.7188902106424416e-05,
+      "loss": 0.6143,
+      "step": 1359
+    },
+    {
+      "epoch": 0.7253333333333334,
+      "grad_norm": 0.35229758190089866,
+      "learning_rate": 3.705453237352227e-05,
+      "loss": 0.5757,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7258666666666667,
+      "grad_norm": 0.3766708379067481,
+      "learning_rate": 3.692035060534088e-05,
+      "loss": 0.6348,
+      "step": 1361
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.41844607847834725,
+      "learning_rate": 3.678635720256737e-05,
+      "loss": 0.6439,
+      "step": 1362
+    },
+    {
+      "epoch": 0.7269333333333333,
+      "grad_norm": 0.4239525619191792,
+      "learning_rate": 3.665255256532638e-05,
+      "loss": 0.7034,
+      "step": 1363
+    },
+    {
+      "epoch": 0.7274666666666667,
+      "grad_norm": 0.36214835153068375,
+      "learning_rate": 3.651893709317887e-05,
+      "loss": 0.6366,
+      "step": 1364
+    },
+    {
+      "epoch": 0.728,
+      "grad_norm": 0.3698420668311701,
+      "learning_rate": 3.638551118512089e-05,
+      "loss": 0.5886,
+      "step": 1365
+    },
+    {
+      "epoch": 0.7285333333333334,
+      "grad_norm": 0.37665993344657184,
+      "learning_rate": 3.625227523958252e-05,
+      "loss": 0.5874,
+      "step": 1366
+    },
+    {
+      "epoch": 0.7290666666666666,
+      "grad_norm": 0.3992543240402163,
+      "learning_rate": 3.611922965442648e-05,
+      "loss": 0.5887,
+      "step": 1367
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4249390586361677,
+      "learning_rate": 3.5986374826947066e-05,
+      "loss": 0.7423,
+      "step": 1368
+    },
+    {
+      "epoch": 0.7301333333333333,
+      "grad_norm": 0.3890548320054069,
+      "learning_rate": 3.5853711153868965e-05,
+      "loss": 0.6296,
+      "step": 1369
+    },
+    {
+      "epoch": 0.7306666666666667,
+      "grad_norm": 0.4369503478017524,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.778,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7312,
+      "grad_norm": 0.44242209088136397,
+      "learning_rate": 3.558895885496023e-05,
+      "loss": 0.6808,
+      "step": 1371
+    },
+    {
+      "epoch": 0.7317333333333333,
+      "grad_norm": 0.3796958428239579,
+      "learning_rate": 3.545687101972013e-05,
+      "loss": 0.6024,
+      "step": 1372
+    },
+    {
+      "epoch": 0.7322666666666666,
+      "grad_norm": 0.46142542855877045,
+      "learning_rate": 3.53249759200601e-05,
+      "loss": 0.6607,
+      "step": 1373
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.38214993944754494,
+      "learning_rate": 3.519327394983888e-05,
+      "loss": 0.6265,
+      "step": 1374
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.4108942588635796,
+      "learning_rate": 3.506176550233863e-05,
+      "loss": 0.6569,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7338666666666667,
+      "grad_norm": 0.43362799075751884,
+      "learning_rate": 3.4930450970263485e-05,
+      "loss": 0.6686,
+      "step": 1376
+    },
+    {
+      "epoch": 0.7344,
+      "grad_norm": 0.44126989338498557,
+      "learning_rate": 3.479933074573858e-05,
+      "loss": 0.6512,
+      "step": 1377
+    },
+    {
+      "epoch": 0.7349333333333333,
+      "grad_norm": 0.4185289731349431,
+      "learning_rate": 3.46684052203088e-05,
+      "loss": 0.6419,
+      "step": 1378
+    },
+    {
+      "epoch": 0.7354666666666667,
+      "grad_norm": 0.4734893958468771,
+      "learning_rate": 3.4537674784937614e-05,
+      "loss": 0.7045,
+      "step": 1379
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.39623039295312157,
+      "learning_rate": 3.440713983000601e-05,
+      "loss": 0.6639,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7365333333333334,
+      "grad_norm": 0.4535746586117549,
+      "learning_rate": 3.427680074531113e-05,
+      "loss": 0.7225,
+      "step": 1381
+    },
+    {
+      "epoch": 0.7370666666666666,
+      "grad_norm": 0.3591509021400412,
+      "learning_rate": 3.4146657920065285e-05,
+      "loss": 0.6195,
+      "step": 1382
+    },
+    {
+      "epoch": 0.7376,
+      "grad_norm": 0.371640842368074,
+      "learning_rate": 3.401671174289469e-05,
+      "loss": 0.6137,
+      "step": 1383
+    },
+    {
+      "epoch": 0.7381333333333333,
+      "grad_norm": 0.4124128542580495,
+      "learning_rate": 3.388696260183832e-05,
+      "loss": 0.6272,
+      "step": 1384
+    },
+    {
+      "epoch": 0.7386666666666667,
+      "grad_norm": 0.40380791242840447,
+      "learning_rate": 3.3757410884346894e-05,
+      "loss": 0.7468,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.357970181024765,
+      "learning_rate": 3.362805697728145e-05,
+      "loss": 0.6093,
+      "step": 1386
+    },
+    {
+      "epoch": 0.7397333333333334,
+      "grad_norm": 0.41222039821290496,
+      "learning_rate": 3.3498901266912396e-05,
+      "loss": 0.6473,
+      "step": 1387
+    },
+    {
+      "epoch": 0.7402666666666666,
+      "grad_norm": 0.33234222419483217,
+      "learning_rate": 3.336994413891828e-05,
+      "loss": 0.5979,
+      "step": 1388
+    },
+    {
+      "epoch": 0.7408,
+      "grad_norm": 0.43062727956941377,
+      "learning_rate": 3.324118597838464e-05,
+      "loss": 0.6807,
+      "step": 1389
+    },
+    {
+      "epoch": 0.7413333333333333,
+      "grad_norm": 0.43833492864942547,
+      "learning_rate": 3.3112627169802946e-05,
+      "loss": 0.6377,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7418666666666667,
+      "grad_norm": 0.4220942461833886,
+      "learning_rate": 3.298426809706928e-05,
+      "loss": 0.6912,
+      "step": 1391
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.43358978810760207,
+      "learning_rate": 3.285610914348332e-05,
+      "loss": 0.6776,
+      "step": 1392
+    },
+    {
+      "epoch": 0.7429333333333333,
+      "grad_norm": 0.4329608656009232,
+      "learning_rate": 3.2728150691747115e-05,
+      "loss": 0.7258,
+      "step": 1393
+    },
+    {
+      "epoch": 0.7434666666666667,
+      "grad_norm": 0.48033728320925306,
+      "learning_rate": 3.2600393123964113e-05,
+      "loss": 0.6717,
+      "step": 1394
+    },
+    {
+      "epoch": 0.744,
+      "grad_norm": 0.3943080237724141,
+      "learning_rate": 3.2472836821637744e-05,
+      "loss": 0.623,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7445333333333334,
+      "grad_norm": 0.37952263078699766,
+      "learning_rate": 3.234548216567049e-05,
+      "loss": 0.6349,
+      "step": 1396
+    },
+    {
+      "epoch": 0.7450666666666667,
+      "grad_norm": 0.4329984014767168,
+      "learning_rate": 3.2218329536362704e-05,
+      "loss": 0.7061,
+      "step": 1397
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.4097847626974238,
+      "learning_rate": 3.209137931341143e-05,
+      "loss": 0.6684,
+      "step": 1398
+    },
+    {
+      "epoch": 0.7461333333333333,
+      "grad_norm": 0.4261613454980889,
+      "learning_rate": 3.196463187590929e-05,
+      "loss": 0.7056,
+      "step": 1399
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.4096693428186534,
+      "learning_rate": 3.1838087602343344e-05,
+      "loss": 0.7031,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7472,
+      "grad_norm": 0.34480011480369654,
+      "learning_rate": 3.1711746870594086e-05,
+      "loss": 0.5557,
+      "step": 1401
+    },
+    {
+      "epoch": 0.7477333333333334,
+      "grad_norm": 0.41176038512615737,
+      "learning_rate": 3.158561005793402e-05,
+      "loss": 0.682,
+      "step": 1402
+    },
+    {
+      "epoch": 0.7482666666666666,
+      "grad_norm": 0.42128821486930673,
+      "learning_rate": 3.145967754102691e-05,
+      "loss": 0.6459,
+      "step": 1403
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.4186312289389503,
+      "learning_rate": 3.1333949695926324e-05,
+      "loss": 0.6703,
+      "step": 1404
+    },
+    {
+      "epoch": 0.7493333333333333,
+      "grad_norm": 0.35582139540762747,
+      "learning_rate": 3.120842689807468e-05,
+      "loss": 0.5538,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7498666666666667,
+      "grad_norm": 0.4594733836842397,
+      "learning_rate": 3.108310952230212e-05,
+      "loss": 0.732,
+      "step": 1406
+    },
+    {
+      "epoch": 0.7504,
+      "grad_norm": 0.3853264621047334,
+      "learning_rate": 3.0957997942825336e-05,
+      "loss": 0.6299,
+      "step": 1407
+    },
+    {
+      "epoch": 0.7509333333333333,
+      "grad_norm": 0.43192092754097916,
+      "learning_rate": 3.083309253324651e-05,
+      "loss": 0.6596,
+      "step": 1408
+    },
+    {
+      "epoch": 0.7514666666666666,
+      "grad_norm": 0.36266708068494263,
+      "learning_rate": 3.070839366655215e-05,
+      "loss": 0.6557,
+      "step": 1409
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3921456918480584,
+      "learning_rate": 3.058390171511196e-05,
+      "loss": 0.632,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7525333333333334,
+      "grad_norm": 0.3682804707384396,
+      "learning_rate": 3.0459617050677868e-05,
+      "loss": 0.6162,
+      "step": 1411
+    },
+    {
+      "epoch": 0.7530666666666667,
+      "grad_norm": 0.40690924327241107,
+      "learning_rate": 3.0335540044382694e-05,
+      "loss": 0.6006,
+      "step": 1412
+    },
+    {
+      "epoch": 0.7536,
+      "grad_norm": 0.3757118918656615,
+      "learning_rate": 3.021167106673928e-05,
+      "loss": 0.6087,
+      "step": 1413
+    },
+    {
+      "epoch": 0.7541333333333333,
+      "grad_norm": 0.4333706567952028,
+      "learning_rate": 3.008801048763914e-05,
+      "loss": 0.7058,
+      "step": 1414
+    },
+    {
+      "epoch": 0.7546666666666667,
+      "grad_norm": 0.441840561374695,
+      "learning_rate": 2.996455867635155e-05,
+      "loss": 0.6384,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3738993216053616,
+      "learning_rate": 2.9841316001522347e-05,
+      "loss": 0.6228,
+      "step": 1416
+    },
+    {
+      "epoch": 0.7557333333333334,
+      "grad_norm": 0.4066619644813712,
+      "learning_rate": 2.9718282831172883e-05,
+      "loss": 0.6187,
+      "step": 1417
+    },
+    {
+      "epoch": 0.7562666666666666,
+      "grad_norm": 0.4249750244231828,
+      "learning_rate": 2.9595459532698854e-05,
+      "loss": 0.6931,
+      "step": 1418
+    },
+    {
+      "epoch": 0.7568,
+      "grad_norm": 0.42388755795260064,
+      "learning_rate": 2.9472846472869298e-05,
+      "loss": 0.6541,
+      "step": 1419
+    },
+    {
+      "epoch": 0.7573333333333333,
+      "grad_norm": 0.4196924046795126,
+      "learning_rate": 2.9350444017825385e-05,
+      "loss": 0.6737,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7578666666666667,
+      "grad_norm": 0.3731126638907941,
+      "learning_rate": 2.922825253307947e-05,
+      "loss": 0.6032,
+      "step": 1421
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.380848844950075,
+      "learning_rate": 2.9106272383513835e-05,
+      "loss": 0.6107,
+      "step": 1422
+    },
+    {
+      "epoch": 0.7589333333333333,
+      "grad_norm": 0.41645384638764094,
+      "learning_rate": 2.898450393337977e-05,
+      "loss": 0.676,
+      "step": 1423
+    },
+    {
+      "epoch": 0.7594666666666666,
+      "grad_norm": 0.39056118434634635,
+      "learning_rate": 2.8862947546296315e-05,
+      "loss": 0.5893,
+      "step": 1424
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.45382049037984407,
+      "learning_rate": 2.874160358524931e-05,
+      "loss": 0.735,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7605333333333333,
+      "grad_norm": 0.3251308768590154,
+      "learning_rate": 2.8620472412590228e-05,
+      "loss": 0.574,
+      "step": 1426
+    },
+    {
+      "epoch": 0.7610666666666667,
+      "grad_norm": 0.5116432309743618,
+      "learning_rate": 2.8499554390035143e-05,
+      "loss": 0.6927,
+      "step": 1427
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.44674476604795116,
+      "learning_rate": 2.8378849878663628e-05,
+      "loss": 0.7238,
+      "step": 1428
+    },
+    {
+      "epoch": 0.7621333333333333,
+      "grad_norm": 0.3882554952135879,
+      "learning_rate": 2.8258359238917665e-05,
+      "loss": 0.6211,
+      "step": 1429
+    },
+    {
+      "epoch": 0.7626666666666667,
+      "grad_norm": 0.5221203037694648,
+      "learning_rate": 2.8138082830600554e-05,
+      "loss": 0.6923,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7632,
+      "grad_norm": 0.36186865407806434,
+      "learning_rate": 2.8018021012875994e-05,
+      "loss": 0.6819,
+      "step": 1431
+    },
+    {
+      "epoch": 0.7637333333333334,
+      "grad_norm": 0.4094104968792091,
+      "learning_rate": 2.7898174144266732e-05,
+      "loss": 0.6534,
+      "step": 1432
+    },
+    {
+      "epoch": 0.7642666666666666,
+      "grad_norm": 0.46324553773117294,
+      "learning_rate": 2.7778542582653744e-05,
+      "loss": 0.6448,
+      "step": 1433
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.40996398425618485,
+      "learning_rate": 2.7659126685275027e-05,
+      "loss": 0.6775,
+      "step": 1434
+    },
+    {
+      "epoch": 0.7653333333333333,
+      "grad_norm": 0.39539878474081086,
+      "learning_rate": 2.753992680872457e-05,
+      "loss": 0.6818,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7658666666666667,
+      "grad_norm": 0.3854112496279747,
+      "learning_rate": 2.7420943308951284e-05,
+      "loss": 0.6211,
+      "step": 1436
+    },
+    {
+      "epoch": 0.7664,
+      "grad_norm": 0.3775024747959915,
+      "learning_rate": 2.7302176541257986e-05,
+      "loss": 0.5985,
+      "step": 1437
+    },
+    {
+      "epoch": 0.7669333333333334,
+      "grad_norm": 0.37930716053438596,
+      "learning_rate": 2.7183626860300247e-05,
+      "loss": 0.582,
+      "step": 1438
+    },
+    {
+      "epoch": 0.7674666666666666,
+      "grad_norm": 0.3676188176044859,
+      "learning_rate": 2.7065294620085424e-05,
+      "loss": 0.5915,
+      "step": 1439
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4024937254354364,
+      "learning_rate": 2.6947180173971508e-05,
+      "loss": 0.6574,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7685333333333333,
+      "grad_norm": 0.44011655859671217,
+      "learning_rate": 2.6829283874666233e-05,
+      "loss": 0.6696,
+      "step": 1441
+    },
+    {
+      "epoch": 0.7690666666666667,
+      "grad_norm": 0.36464416554288664,
+      "learning_rate": 2.6711606074225782e-05,
+      "loss": 0.6397,
+      "step": 1442
+    },
+    {
+      "epoch": 0.7696,
+      "grad_norm": 0.4187645594144827,
+      "learning_rate": 2.659414712405398e-05,
+      "loss": 0.7247,
+      "step": 1443
+    },
+    {
+      "epoch": 0.7701333333333333,
+      "grad_norm": 0.3600886891680787,
+      "learning_rate": 2.647690737490106e-05,
+      "loss": 0.611,
+      "step": 1444
+    },
+    {
+      "epoch": 0.7706666666666667,
+      "grad_norm": 0.38212962826262514,
+      "learning_rate": 2.6359887176862718e-05,
+      "loss": 0.6104,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3919783855846775,
+      "learning_rate": 2.6243086879379e-05,
+      "loss": 0.5956,
+      "step": 1446
+    },
+    {
+      "epoch": 0.7717333333333334,
+      "grad_norm": 0.3710565120206455,
+      "learning_rate": 2.6126506831233344e-05,
+      "loss": 0.6252,
+      "step": 1447
+    },
+    {
+      "epoch": 0.7722666666666667,
+      "grad_norm": 0.4447727487052963,
+      "learning_rate": 2.6010147380551475e-05,
+      "loss": 0.6349,
+      "step": 1448
+    },
+    {
+      "epoch": 0.7728,
+      "grad_norm": 0.40844341307921317,
+      "learning_rate": 2.5894008874800325e-05,
+      "loss": 0.6447,
+      "step": 1449
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.42002539746137924,
+      "learning_rate": 2.577809166078716e-05,
+      "loss": 0.6697,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7738666666666667,
+      "grad_norm": 0.35688537235558,
+      "learning_rate": 2.566239608465838e-05,
+      "loss": 0.5776,
+      "step": 1451
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.3849764676618137,
+      "learning_rate": 2.5546922491898495e-05,
+      "loss": 0.6557,
+      "step": 1452
+    },
+    {
+      "epoch": 0.7749333333333334,
+      "grad_norm": 0.40461391785601625,
+      "learning_rate": 2.543167122732918e-05,
+      "loss": 0.6619,
+      "step": 1453
+    },
+    {
+      "epoch": 0.7754666666666666,
+      "grad_norm": 0.407309234941252,
+      "learning_rate": 2.5316642635108244e-05,
+      "loss": 0.6472,
+      "step": 1454
+    },
+    {
+      "epoch": 0.776,
+      "grad_norm": 0.39665205227387035,
+      "learning_rate": 2.5201837058728505e-05,
+      "loss": 0.6505,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7765333333333333,
+      "grad_norm": 0.421233040167664,
+      "learning_rate": 2.508725484101684e-05,
+      "loss": 0.6698,
+      "step": 1456
+    },
+    {
+      "epoch": 0.7770666666666667,
+      "grad_norm": 0.41035775157864474,
+      "learning_rate": 2.4972896324133144e-05,
+      "loss": 0.622,
+      "step": 1457
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.4418146090773731,
+      "learning_rate": 2.485876184956928e-05,
+      "loss": 0.6442,
+      "step": 1458
+    },
+    {
+      "epoch": 0.7781333333333333,
+      "grad_norm": 0.5145318498415639,
+      "learning_rate": 2.4744851758148156e-05,
+      "loss": 0.6309,
+      "step": 1459
+    },
+    {
+      "epoch": 0.7786666666666666,
+      "grad_norm": 0.37263262406049175,
+      "learning_rate": 2.4631166390022574e-05,
+      "loss": 0.6281,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7792,
+      "grad_norm": 0.4264907048999342,
+      "learning_rate": 2.451770608467432e-05,
+      "loss": 0.6345,
+      "step": 1461
+    },
+    {
+      "epoch": 0.7797333333333333,
+      "grad_norm": 0.37532832160156965,
+      "learning_rate": 2.4404471180913058e-05,
+      "loss": 0.6501,
+      "step": 1462
+    },
+    {
+      "epoch": 0.7802666666666667,
+      "grad_norm": 0.41906661417341445,
+      "learning_rate": 2.429146201687538e-05,
+      "loss": 0.6608,
+      "step": 1463
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.46520029369584237,
+      "learning_rate": 2.417867893002387e-05,
+      "loss": 0.7747,
+      "step": 1464
+    },
+    {
+      "epoch": 0.7813333333333333,
+      "grad_norm": 0.3878808565874205,
+      "learning_rate": 2.4066122257145894e-05,
+      "loss": 0.6732,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7818666666666667,
+      "grad_norm": 0.45584845508500854,
+      "learning_rate": 2.3953792334352787e-05,
+      "loss": 0.726,
+      "step": 1466
+    },
+    {
+      "epoch": 0.7824,
+      "grad_norm": 0.4060230858072229,
+      "learning_rate": 2.3841689497078746e-05,
+      "loss": 0.6515,
+      "step": 1467
+    },
+    {
+      "epoch": 0.7829333333333334,
+      "grad_norm": 0.3780041121096824,
+      "learning_rate": 2.3729814080079816e-05,
+      "loss": 0.6279,
+      "step": 1468
+    },
+    {
+      "epoch": 0.7834666666666666,
+      "grad_norm": 0.38533091717016743,
+      "learning_rate": 2.361816641743303e-05,
+      "loss": 0.6267,
+      "step": 1469
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.41249480165369035,
+      "learning_rate": 2.3506746842535242e-05,
+      "loss": 0.6326,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7845333333333333,
+      "grad_norm": 0.47986898298335223,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.6894,
+      "step": 1471
+    },
+    {
+      "epoch": 0.7850666666666667,
+      "grad_norm": 0.3562972128885994,
+      "learning_rate": 2.328459328616759e-05,
+      "loss": 0.6002,
+      "step": 1472
+    },
+    {
+      "epoch": 0.7856,
+      "grad_norm": 0.3926240308679547,
+      "learning_rate": 2.3173859968081944e-05,
+      "loss": 0.6376,
+      "step": 1473
+    },
+    {
+      "epoch": 0.7861333333333334,
+      "grad_norm": 0.399779778834616,
+      "learning_rate": 2.306335606451181e-05,
+      "loss": 0.6776,
+      "step": 1474
+    },
+    {
+      "epoch": 0.7866666666666666,
+      "grad_norm": 0.369862176791047,
+      "learning_rate": 2.295308190543859e-05,
+      "loss": 0.6489,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.40706635158485904,
+      "learning_rate": 2.2843037820157675e-05,
+      "loss": 0.6555,
+      "step": 1476
+    },
+    {
+      "epoch": 0.7877333333333333,
+      "grad_norm": 0.4360060077790018,
+      "learning_rate": 2.2733224137277366e-05,
+      "loss": 0.6844,
+      "step": 1477
+    },
+    {
+      "epoch": 0.7882666666666667,
+      "grad_norm": 0.38720371081753013,
+      "learning_rate": 2.262364118471805e-05,
+      "loss": 0.644,
+      "step": 1478
+    },
+    {
+      "epoch": 0.7888,
+      "grad_norm": 0.42319056314765996,
+      "learning_rate": 2.251428928971102e-05,
+      "loss": 0.6267,
+      "step": 1479
+    },
+    {
+      "epoch": 0.7893333333333333,
+      "grad_norm": 0.3604095302189909,
+      "learning_rate": 2.2405168778797646e-05,
+      "loss": 0.6298,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7898666666666667,
+      "grad_norm": 0.3925068211717366,
+      "learning_rate": 2.2296279977828337e-05,
+      "loss": 0.6511,
+      "step": 1481
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.41655114650776953,
+      "learning_rate": 2.2187623211961562e-05,
+      "loss": 0.6515,
+      "step": 1482
+    },
+    {
+      "epoch": 0.7909333333333334,
+      "grad_norm": 0.4770536152552729,
+      "learning_rate": 2.2079198805662914e-05,
+      "loss": 0.6991,
+      "step": 1483
+    },
+    {
+      "epoch": 0.7914666666666667,
+      "grad_norm": 0.4246596112012112,
+      "learning_rate": 2.1971007082704164e-05,
+      "loss": 0.7063,
+      "step": 1484
+    },
+    {
+      "epoch": 0.792,
+      "grad_norm": 0.4302795286271402,
+      "learning_rate": 2.1863048366162208e-05,
+      "loss": 0.655,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7925333333333333,
+      "grad_norm": 0.3676952748071041,
+      "learning_rate": 2.1755322978418137e-05,
+      "loss": 0.5687,
+      "step": 1486
+    },
+    {
+      "epoch": 0.7930666666666667,
+      "grad_norm": 0.38500645975771264,
+      "learning_rate": 2.1647831241156302e-05,
+      "loss": 0.6592,
+      "step": 1487
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4291137226921338,
+      "learning_rate": 2.1540573475363402e-05,
+      "loss": 0.6636,
+      "step": 1488
+    },
+    {
+      "epoch": 0.7941333333333334,
+      "grad_norm": 0.4486703074814581,
+      "learning_rate": 2.1433550001327373e-05,
+      "loss": 0.6584,
+      "step": 1489
+    },
+    {
+      "epoch": 0.7946666666666666,
+      "grad_norm": 0.44856230316275914,
+      "learning_rate": 2.1326761138636553e-05,
+      "loss": 0.7457,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7952,
+      "grad_norm": 0.4155731229227311,
+      "learning_rate": 2.1220207206178688e-05,
+      "loss": 0.6563,
+      "step": 1491
+    },
+    {
+      "epoch": 0.7957333333333333,
+      "grad_norm": 0.44306480487160593,
+      "learning_rate": 2.111388852214001e-05,
+      "loss": 0.5888,
+      "step": 1492
+    },
+    {
+      "epoch": 0.7962666666666667,
+      "grad_norm": 0.3686384335155605,
+      "learning_rate": 2.1007805404004242e-05,
+      "loss": 0.5991,
+      "step": 1493
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3664143734077232,
+      "learning_rate": 2.0901958168551638e-05,
+      "loss": 0.6081,
+      "step": 1494
+    },
+    {
+      "epoch": 0.7973333333333333,
+      "grad_norm": 0.42374095741997003,
+      "learning_rate": 2.0796347131858186e-05,
+      "loss": 0.6569,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7978666666666666,
+      "grad_norm": 0.404103314861036,
+      "learning_rate": 2.069097260929439e-05,
+      "loss": 0.6889,
+      "step": 1496
+    },
+    {
+      "epoch": 0.7984,
+      "grad_norm": 0.37552177045965424,
+      "learning_rate": 2.058583491552465e-05,
+      "loss": 0.6132,
+      "step": 1497
+    },
+    {
+      "epoch": 0.7989333333333334,
+      "grad_norm": 0.38204794186683344,
+      "learning_rate": 2.048093436450603e-05,
+      "loss": 0.6706,
+      "step": 1498
+    },
+    {
+      "epoch": 0.7994666666666667,
+      "grad_norm": 0.3755646096604971,
+      "learning_rate": 2.0376271269487514e-05,
+      "loss": 0.6085,
+      "step": 1499
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3904162423032261,
+      "learning_rate": 2.027184594300898e-05,
+      "loss": 0.5988,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8005333333333333,
+      "grad_norm": 0.40648751526987936,
+      "learning_rate": 2.0167658696900317e-05,
+      "loss": 0.619,
+      "step": 1501
+    },
+    {
+      "epoch": 0.8010666666666667,
+      "grad_norm": 0.40139250248289865,
+      "learning_rate": 2.0063709842280432e-05,
+      "loss": 0.6293,
+      "step": 1502
+    },
+    {
+      "epoch": 0.8016,
+      "grad_norm": 0.41177215865758354,
+      "learning_rate": 1.995999968955641e-05,
+      "loss": 0.6179,
+      "step": 1503
+    },
+    {
+      "epoch": 0.8021333333333334,
+      "grad_norm": 0.3966353468417924,
+      "learning_rate": 1.985652854842247e-05,
+      "loss": 0.6338,
+      "step": 1504
+    },
+    {
+      "epoch": 0.8026666666666666,
+      "grad_norm": 0.35642516093977317,
+      "learning_rate": 1.9753296727859195e-05,
+      "loss": 0.6419,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.43270269076477214,
+      "learning_rate": 1.9650304536132426e-05,
+      "loss": 0.6659,
+      "step": 1506
+    },
+    {
+      "epoch": 0.8037333333333333,
+      "grad_norm": 0.37599262291588553,
+      "learning_rate": 1.9547552280792524e-05,
+      "loss": 0.6378,
+      "step": 1507
+    },
+    {
+      "epoch": 0.8042666666666667,
+      "grad_norm": 0.5116979587249396,
+      "learning_rate": 1.9445040268673298e-05,
+      "loss": 0.7785,
+      "step": 1508
+    },
+    {
+      "epoch": 0.8048,
+      "grad_norm": 0.42212680329806335,
+      "learning_rate": 1.9342768805891178e-05,
+      "loss": 0.696,
+      "step": 1509
+    },
+    {
+      "epoch": 0.8053333333333333,
+      "grad_norm": 0.3799812964323031,
+      "learning_rate": 1.9240738197844278e-05,
+      "loss": 0.6058,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8058666666666666,
+      "grad_norm": 0.3591460903004341,
+      "learning_rate": 1.9138948749211472e-05,
+      "loss": 0.5966,
+      "step": 1511
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.4008722219724583,
+      "learning_rate": 1.903740076395151e-05,
+      "loss": 0.644,
+      "step": 1512
+    },
+    {
+      "epoch": 0.8069333333333333,
+      "grad_norm": 0.4147177426055831,
+      "learning_rate": 1.8936094545302095e-05,
+      "loss": 0.5915,
+      "step": 1513
+    },
+    {
+      "epoch": 0.8074666666666667,
+      "grad_norm": 0.37318212586856425,
+      "learning_rate": 1.883503039577894e-05,
+      "loss": 0.5931,
+      "step": 1514
+    },
+    {
+      "epoch": 0.808,
+      "grad_norm": 0.3621194206193851,
+      "learning_rate": 1.8734208617174988e-05,
+      "loss": 0.5921,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8085333333333333,
+      "grad_norm": 0.43241355380550084,
+      "learning_rate": 1.8633629510559314e-05,
+      "loss": 0.6547,
+      "step": 1516
+    },
+    {
+      "epoch": 0.8090666666666667,
+      "grad_norm": 0.3863238141027008,
+      "learning_rate": 1.8533293376276472e-05,
+      "loss": 0.5853,
+      "step": 1517
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.44936168495273077,
+      "learning_rate": 1.8433200513945337e-05,
+      "loss": 0.67,
+      "step": 1518
+    },
+    {
+      "epoch": 0.8101333333333334,
+      "grad_norm": 0.43717066334186366,
+      "learning_rate": 1.8333351222458407e-05,
+      "loss": 0.6639,
+      "step": 1519
+    },
+    {
+      "epoch": 0.8106666666666666,
+      "grad_norm": 0.5033016938744572,
+      "learning_rate": 1.8233745799980817e-05,
+      "loss": 0.7191,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8112,
+      "grad_norm": 0.45779706903350137,
+      "learning_rate": 1.8134384543949478e-05,
+      "loss": 0.6841,
+      "step": 1521
+    },
+    {
+      "epoch": 0.8117333333333333,
+      "grad_norm": 0.3507320796895244,
+      "learning_rate": 1.803526775107217e-05,
+      "loss": 0.5675,
+      "step": 1522
+    },
+    {
+      "epoch": 0.8122666666666667,
+      "grad_norm": 0.4098731143735821,
+      "learning_rate": 1.7936395717326704e-05,
+      "loss": 0.6578,
+      "step": 1523
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4083470933188338,
+      "learning_rate": 1.783776873795994e-05,
+      "loss": 0.6906,
+      "step": 1524
+    },
+    {
+      "epoch": 0.8133333333333334,
+      "grad_norm": 0.3760035011253386,
+      "learning_rate": 1.773938710748706e-05,
+      "loss": 0.6131,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8138666666666666,
+      "grad_norm": 0.3435090246365492,
+      "learning_rate": 1.7641251119690505e-05,
+      "loss": 0.6245,
+      "step": 1526
+    },
+    {
+      "epoch": 0.8144,
+      "grad_norm": 0.3611412759226195,
+      "learning_rate": 1.754336106761927e-05,
+      "loss": 0.5484,
+      "step": 1527
+    },
+    {
+      "epoch": 0.8149333333333333,
+      "grad_norm": 0.37981053393979597,
+      "learning_rate": 1.744571724358789e-05,
+      "loss": 0.6176,
+      "step": 1528
+    },
+    {
+      "epoch": 0.8154666666666667,
+      "grad_norm": 0.39732487013425777,
+      "learning_rate": 1.7348319939175637e-05,
+      "loss": 0.6303,
+      "step": 1529
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.4170444261864867,
+      "learning_rate": 1.7251169445225657e-05,
+      "loss": 0.6259,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8165333333333333,
+      "grad_norm": 0.3963048899451401,
+      "learning_rate": 1.715426605184407e-05,
+      "loss": 0.5872,
+      "step": 1531
+    },
+    {
+      "epoch": 0.8170666666666667,
+      "grad_norm": 0.38433101401537006,
+      "learning_rate": 1.705761004839911e-05,
+      "loss": 0.6413,
+      "step": 1532
+    },
+    {
+      "epoch": 0.8176,
+      "grad_norm": 0.40031332813975723,
+      "learning_rate": 1.696120172352025e-05,
+      "loss": 0.6614,
+      "step": 1533
+    },
+    {
+      "epoch": 0.8181333333333334,
+      "grad_norm": 0.39456795129406186,
+      "learning_rate": 1.6865041365097435e-05,
+      "loss": 0.6257,
+      "step": 1534
+    },
+    {
+      "epoch": 0.8186666666666667,
+      "grad_norm": 0.37751709007069784,
+      "learning_rate": 1.676912926028007e-05,
+      "loss": 0.6624,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.41648436193135147,
+      "learning_rate": 1.6673465695476232e-05,
+      "loss": 0.6717,
+      "step": 1536
+    },
+    {
+      "epoch": 0.8197333333333333,
+      "grad_norm": 0.40360852282661724,
+      "learning_rate": 1.6578050956351886e-05,
+      "loss": 0.6025,
+      "step": 1537
+    },
+    {
+      "epoch": 0.8202666666666667,
+      "grad_norm": 0.4130380968592082,
+      "learning_rate": 1.6482885327829913e-05,
+      "loss": 0.692,
+      "step": 1538
+    },
+    {
+      "epoch": 0.8208,
+      "grad_norm": 0.41199731700306036,
+      "learning_rate": 1.6387969094089316e-05,
+      "loss": 0.6288,
+      "step": 1539
+    },
+    {
+      "epoch": 0.8213333333333334,
+      "grad_norm": 0.36383292387156796,
+      "learning_rate": 1.6293302538564382e-05,
+      "loss": 0.5999,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8218666666666666,
+      "grad_norm": 0.4100830994828841,
+      "learning_rate": 1.619888594394382e-05,
+      "loss": 0.6698,
+      "step": 1541
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.4218887808159194,
+      "learning_rate": 1.6104719592169902e-05,
+      "loss": 0.6669,
+      "step": 1542
+    },
+    {
+      "epoch": 0.8229333333333333,
+      "grad_norm": 0.47142422539345946,
+      "learning_rate": 1.601080376443763e-05,
+      "loss": 0.7068,
+      "step": 1543
+    },
+    {
+      "epoch": 0.8234666666666667,
+      "grad_norm": 0.45856892382442926,
+      "learning_rate": 1.5917138741193973e-05,
+      "loss": 0.7278,
+      "step": 1544
+    },
+    {
+      "epoch": 0.824,
+      "grad_norm": 0.3992066272331399,
+      "learning_rate": 1.5823724802136865e-05,
+      "loss": 0.6356,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8245333333333333,
+      "grad_norm": 0.4118056801784563,
+      "learning_rate": 1.573056222621453e-05,
+      "loss": 0.6769,
+      "step": 1546
+    },
+    {
+      "epoch": 0.8250666666666666,
+      "grad_norm": 0.4037266190552037,
+      "learning_rate": 1.5637651291624523e-05,
+      "loss": 0.6252,
+      "step": 1547
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.4093902498313882,
+      "learning_rate": 1.5544992275813053e-05,
+      "loss": 0.5753,
+      "step": 1548
+    },
+    {
+      "epoch": 0.8261333333333334,
+      "grad_norm": 0.4014978890448161,
+      "learning_rate": 1.5452585455473977e-05,
+      "loss": 0.6096,
+      "step": 1549
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.41271772645387594,
+      "learning_rate": 1.536043110654809e-05,
+      "loss": 0.6634,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8272,
+      "grad_norm": 0.4599963465208382,
+      "learning_rate": 1.526852950422226e-05,
+      "loss": 0.6722,
+      "step": 1551
+    },
+    {
+      "epoch": 0.8277333333333333,
+      "grad_norm": 0.37602488809181833,
+      "learning_rate": 1.5176880922928616e-05,
+      "loss": 0.6434,
+      "step": 1552
+    },
+    {
+      "epoch": 0.8282666666666667,
+      "grad_norm": 0.3732185738125047,
+      "learning_rate": 1.5085485636343755e-05,
+      "loss": 0.5867,
+      "step": 1553
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.46329867727693624,
+      "learning_rate": 1.4994343917387854e-05,
+      "loss": 0.7339,
+      "step": 1554
+    },
+    {
+      "epoch": 0.8293333333333334,
+      "grad_norm": 0.321860242670046,
+      "learning_rate": 1.4903456038223939e-05,
+      "loss": 0.5824,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8298666666666666,
+      "grad_norm": 0.37045770017891644,
+      "learning_rate": 1.4812822270257009e-05,
+      "loss": 0.5911,
+      "step": 1556
+    },
+    {
+      "epoch": 0.8304,
+      "grad_norm": 0.38642688964530936,
+      "learning_rate": 1.4722442884133214e-05,
+      "loss": 0.6728,
+      "step": 1557
+    },
+    {
+      "epoch": 0.8309333333333333,
+      "grad_norm": 0.3544747208762738,
+      "learning_rate": 1.4632318149739177e-05,
+      "loss": 0.6381,
+      "step": 1558
+    },
+    {
+      "epoch": 0.8314666666666667,
+      "grad_norm": 0.3939531237308629,
+      "learning_rate": 1.454244833620102e-05,
+      "loss": 0.661,
+      "step": 1559
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.40161912955238394,
+      "learning_rate": 1.4452833711883628e-05,
+      "loss": 0.599,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8325333333333333,
+      "grad_norm": 0.42788960629838485,
+      "learning_rate": 1.4363474544389877e-05,
+      "loss": 0.7166,
+      "step": 1561
+    },
+    {
+      "epoch": 0.8330666666666666,
+      "grad_norm": 0.41324912480142906,
+      "learning_rate": 1.4274371100559791e-05,
+      "loss": 0.6558,
+      "step": 1562
+    },
+    {
+      "epoch": 0.8336,
+      "grad_norm": 0.3593332940563285,
+      "learning_rate": 1.4185523646469822e-05,
+      "loss": 0.549,
+      "step": 1563
+    },
+    {
+      "epoch": 0.8341333333333333,
+      "grad_norm": 0.4179808942660934,
+      "learning_rate": 1.409693244743192e-05,
+      "loss": 0.6443,
+      "step": 1564
+    },
+    {
+      "epoch": 0.8346666666666667,
+      "grad_norm": 0.3445566195625704,
+      "learning_rate": 1.4008597767992871e-05,
+      "loss": 0.5818,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.4001823627247086,
+      "learning_rate": 1.3920519871933424e-05,
+      "loss": 0.6468,
+      "step": 1566
+    },
+    {
+      "epoch": 0.8357333333333333,
+      "grad_norm": 0.365228111068425,
+      "learning_rate": 1.3832699022267515e-05,
+      "loss": 0.6243,
+      "step": 1567
+    },
+    {
+      "epoch": 0.8362666666666667,
+      "grad_norm": 0.3578894931308083,
+      "learning_rate": 1.37451354812416e-05,
+      "loss": 0.5683,
+      "step": 1568
+    },
+    {
+      "epoch": 0.8368,
+      "grad_norm": 0.4631016919345303,
+      "learning_rate": 1.3657829510333654e-05,
+      "loss": 0.6711,
+      "step": 1569
+    },
+    {
+      "epoch": 0.8373333333333334,
+      "grad_norm": 0.41538216659285476,
+      "learning_rate": 1.3570781370252582e-05,
+      "loss": 0.6859,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8378666666666666,
+      "grad_norm": 0.38557045548720886,
+      "learning_rate": 1.3483991320937306e-05,
+      "loss": 0.6264,
+      "step": 1571
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3642820157209577,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 0.5968,
+      "step": 1572
+    },
+    {
+      "epoch": 0.8389333333333333,
+      "grad_norm": 0.39087622995060745,
+      "learning_rate": 1.3311186530505838e-05,
+      "loss": 0.5998,
+      "step": 1573
+    },
+    {
+      "epoch": 0.8394666666666667,
+      "grad_norm": 0.3790450593972957,
+      "learning_rate": 1.322517230541096e-05,
+      "loss": 0.6498,
+      "step": 1574
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.38428182843060554,
+      "learning_rate": 1.3139417203123027e-05,
+      "loss": 0.5748,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8405333333333334,
+      "grad_norm": 0.36044515105718045,
+      "learning_rate": 1.30539214797198e-05,
+      "loss": 0.6033,
+      "step": 1576
+    },
+    {
+      "epoch": 0.8410666666666666,
+      "grad_norm": 0.3602870134258713,
+      "learning_rate": 1.2968685390504465e-05,
+      "loss": 0.5785,
+      "step": 1577
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3909645239819257,
+      "learning_rate": 1.2883709190004955e-05,
+      "loss": 0.6268,
+      "step": 1578
+    },
+    {
+      "epoch": 0.8421333333333333,
+      "grad_norm": 0.3679001478449064,
+      "learning_rate": 1.2798993131973091e-05,
+      "loss": 0.6176,
+      "step": 1579
+    },
+    {
+      "epoch": 0.8426666666666667,
+      "grad_norm": 0.4578232413768848,
+      "learning_rate": 1.2714537469383858e-05,
+      "loss": 0.7315,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8432,
+      "grad_norm": 0.38246633575623384,
+      "learning_rate": 1.263034245443473e-05,
+      "loss": 0.6616,
+      "step": 1581
+    },
+    {
+      "epoch": 0.8437333333333333,
+      "grad_norm": 0.3278031029085765,
+      "learning_rate": 1.2546408338544769e-05,
+      "loss": 0.5653,
+      "step": 1582
+    },
+    {
+      "epoch": 0.8442666666666667,
+      "grad_norm": 0.3952825621324949,
+      "learning_rate": 1.2462735372353996e-05,
+      "loss": 0.6384,
+      "step": 1583
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.40482241550609704,
+      "learning_rate": 1.2379323805722576e-05,
+      "loss": 0.6229,
+      "step": 1584
+    },
+    {
+      "epoch": 0.8453333333333334,
+      "grad_norm": 0.4225222734127927,
+      "learning_rate": 1.2296173887730123e-05,
+      "loss": 0.6503,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8458666666666667,
+      "grad_norm": 0.347801809382486,
+      "learning_rate": 1.2213285866674905e-05,
+      "loss": 0.5663,
+      "step": 1586
+    },
+    {
+      "epoch": 0.8464,
+      "grad_norm": 0.4362943523325906,
+      "learning_rate": 1.2130659990073146e-05,
+      "loss": 0.6135,
+      "step": 1587
+    },
+    {
+      "epoch": 0.8469333333333333,
+      "grad_norm": 0.4326524921153363,
+      "learning_rate": 1.2048296504658207e-05,
+      "loss": 0.715,
+      "step": 1588
+    },
+    {
+      "epoch": 0.8474666666666667,
+      "grad_norm": 0.4314227468057858,
+      "learning_rate": 1.1966195656380031e-05,
+      "loss": 0.7296,
+      "step": 1589
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.39005495379037797,
+      "learning_rate": 1.1884357690404158e-05,
+      "loss": 0.6103,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8485333333333334,
+      "grad_norm": 0.37697134200698884,
+      "learning_rate": 1.1802782851111205e-05,
+      "loss": 0.6663,
+      "step": 1591
+    },
+    {
+      "epoch": 0.8490666666666666,
+      "grad_norm": 0.3734450064142702,
+      "learning_rate": 1.1721471382096027e-05,
+      "loss": 0.586,
+      "step": 1592
+    },
+    {
+      "epoch": 0.8496,
+      "grad_norm": 0.46314791891224083,
+      "learning_rate": 1.1640423526166988e-05,
+      "loss": 0.6743,
+      "step": 1593
+    },
+    {
+      "epoch": 0.8501333333333333,
+      "grad_norm": 0.43504959813464567,
+      "learning_rate": 1.1559639525345311e-05,
+      "loss": 0.6121,
+      "step": 1594
+    },
+    {
+      "epoch": 0.8506666666666667,
+      "grad_norm": 0.36929908422880675,
+      "learning_rate": 1.1479119620864276e-05,
+      "loss": 0.6279,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.49701659054680103,
+      "learning_rate": 1.1398864053168534e-05,
+      "loss": 0.7483,
+      "step": 1596
+    },
+    {
+      "epoch": 0.8517333333333333,
+      "grad_norm": 0.4222092092548254,
+      "learning_rate": 1.1318873061913405e-05,
+      "loss": 0.676,
+      "step": 1597
+    },
+    {
+      "epoch": 0.8522666666666666,
+      "grad_norm": 0.45831563430137234,
+      "learning_rate": 1.123914688596409e-05,
+      "loss": 0.7232,
+      "step": 1598
+    },
+    {
+      "epoch": 0.8528,
+      "grad_norm": 0.38874723637823067,
+      "learning_rate": 1.1159685763395111e-05,
+      "loss": 0.6417,
+      "step": 1599
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.37989141337877586,
+      "learning_rate": 1.1080489931489391e-05,
+      "loss": 0.5809,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8538666666666667,
+      "grad_norm": 0.4166422428912336,
+      "learning_rate": 1.1001559626737756e-05,
+      "loss": 0.6342,
+      "step": 1601
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.36713917128878437,
+      "learning_rate": 1.0922895084838037e-05,
+      "loss": 0.6315,
+      "step": 1602
+    },
+    {
+      "epoch": 0.8549333333333333,
+      "grad_norm": 0.37609252189776415,
+      "learning_rate": 1.0844496540694515e-05,
+      "loss": 0.6151,
+      "step": 1603
+    },
+    {
+      "epoch": 0.8554666666666667,
+      "grad_norm": 0.376209779205758,
+      "learning_rate": 1.0766364228417148e-05,
+      "loss": 0.636,
+      "step": 1604
+    },
+    {
+      "epoch": 0.856,
+      "grad_norm": 0.40409534948346404,
+      "learning_rate": 1.0688498381320855e-05,
+      "loss": 0.6576,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8565333333333334,
+      "grad_norm": 0.3580445368354473,
+      "learning_rate": 1.0610899231924886e-05,
+      "loss": 0.5748,
+      "step": 1606
+    },
+    {
+      "epoch": 0.8570666666666666,
+      "grad_norm": 0.34008507004316907,
+      "learning_rate": 1.0533567011952094e-05,
+      "loss": 0.5227,
+      "step": 1607
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3685045006334314,
+      "learning_rate": 1.045650195232819e-05,
+      "loss": 0.5796,
+      "step": 1608
+    },
+    {
+      "epoch": 0.8581333333333333,
+      "grad_norm": 0.389301347448525,
+      "learning_rate": 1.0379704283181179e-05,
+      "loss": 0.5669,
+      "step": 1609
+    },
+    {
+      "epoch": 0.8586666666666667,
+      "grad_norm": 0.4083232449511819,
+      "learning_rate": 1.0303174233840528e-05,
+      "loss": 0.6492,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8592,
+      "grad_norm": 0.41614186078286347,
+      "learning_rate": 1.0226912032836611e-05,
+      "loss": 0.7204,
+      "step": 1611
+    },
+    {
+      "epoch": 0.8597333333333333,
+      "grad_norm": 0.42405989365191976,
+      "learning_rate": 1.0150917907899926e-05,
+      "loss": 0.6691,
+      "step": 1612
+    },
+    {
+      "epoch": 0.8602666666666666,
+      "grad_norm": 0.3905776917851294,
+      "learning_rate": 1.007519208596045e-05,
+      "loss": 0.6346,
+      "step": 1613
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4240628512154941,
+      "learning_rate": 9.999734793146998e-06,
+      "loss": 0.6504,
+      "step": 1614
+    },
+    {
+      "epoch": 0.8613333333333333,
+      "grad_norm": 0.42225047989138437,
+      "learning_rate": 9.924546254786493e-06,
+      "loss": 0.6832,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8618666666666667,
+      "grad_norm": 0.4036307026291307,
+      "learning_rate": 9.849626695403324e-06,
+      "loss": 0.5814,
+      "step": 1616
+    },
+    {
+      "epoch": 0.8624,
+      "grad_norm": 0.4032539463748285,
+      "learning_rate": 9.774976338718677e-06,
+      "loss": 0.6393,
+      "step": 1617
+    },
+    {
+      "epoch": 0.8629333333333333,
+      "grad_norm": 0.44355391910769243,
+      "learning_rate": 9.700595407649805e-06,
+      "loss": 0.664,
+      "step": 1618
+    },
+    {
+      "epoch": 0.8634666666666667,
+      "grad_norm": 0.42149209704525703,
+      "learning_rate": 9.62648412430951e-06,
+      "loss": 0.6334,
+      "step": 1619
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.44447353675726664,
+      "learning_rate": 9.552642710005299e-06,
+      "loss": 0.586,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8645333333333334,
+      "grad_norm": 0.4888778248873591,
+      "learning_rate": 9.479071385238892e-06,
+      "loss": 0.6906,
+      "step": 1621
+    },
+    {
+      "epoch": 0.8650666666666667,
+      "grad_norm": 0.5618455210067266,
+      "learning_rate": 9.40577036970538e-06,
+      "loss": 0.6409,
+      "step": 1622
+    },
+    {
+      "epoch": 0.8656,
+      "grad_norm": 0.48120743644240427,
+      "learning_rate": 9.332739882292752e-06,
+      "loss": 0.735,
+      "step": 1623
+    },
+    {
+      "epoch": 0.8661333333333333,
+      "grad_norm": 0.43725656309249994,
+      "learning_rate": 9.259980141081115e-06,
+      "loss": 0.6324,
+      "step": 1624
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.4078778982634269,
+      "learning_rate": 9.187491363342093e-06,
+      "loss": 0.6272,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.39262040137363846,
+      "learning_rate": 9.115273765538202e-06,
+      "loss": 0.6683,
+      "step": 1626
+    },
+    {
+      "epoch": 0.8677333333333334,
+      "grad_norm": 0.4056787918545523,
+      "learning_rate": 9.043327563322112e-06,
+      "loss": 0.6422,
+      "step": 1627
+    },
+    {
+      "epoch": 0.8682666666666666,
+      "grad_norm": 0.3710993037706992,
+      "learning_rate": 8.971652971536148e-06,
+      "loss": 0.6207,
+      "step": 1628
+    },
+    {
+      "epoch": 0.8688,
+      "grad_norm": 0.4181428824263666,
+      "learning_rate": 8.900250204211514e-06,
+      "loss": 0.6546,
+      "step": 1629
+    },
+    {
+      "epoch": 0.8693333333333333,
+      "grad_norm": 0.36225059464645426,
+      "learning_rate": 8.829119474567671e-06,
+      "loss": 0.6119,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8698666666666667,
+      "grad_norm": 0.4251014294424469,
+      "learning_rate": 8.758260995011825e-06,
+      "loss": 0.663,
+      "step": 1631
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4756696780031577,
+      "learning_rate": 8.687674977138116e-06,
+      "loss": 0.6912,
+      "step": 1632
+    },
+    {
+      "epoch": 0.8709333333333333,
+      "grad_norm": 0.3554683301896617,
+      "learning_rate": 8.617361631727138e-06,
+      "loss": 0.548,
+      "step": 1633
+    },
+    {
+      "epoch": 0.8714666666666666,
+      "grad_norm": 0.603016536409824,
+      "learning_rate": 8.547321168745193e-06,
+      "loss": 0.7974,
+      "step": 1634
+    },
+    {
+      "epoch": 0.872,
+      "grad_norm": 0.3923180270827677,
+      "learning_rate": 8.47755379734373e-06,
+      "loss": 0.6231,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8725333333333334,
+      "grad_norm": 0.3929339474375356,
+      "learning_rate": 8.408059725858719e-06,
+      "loss": 0.5855,
+      "step": 1636
+    },
+    {
+      "epoch": 0.8730666666666667,
+      "grad_norm": 0.3852648359159864,
+      "learning_rate": 8.338839161809997e-06,
+      "loss": 0.6461,
+      "step": 1637
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.38968951516630496,
+      "learning_rate": 8.269892311900696e-06,
+      "loss": 0.6216,
+      "step": 1638
+    },
+    {
+      "epoch": 0.8741333333333333,
+      "grad_norm": 0.3797986191453474,
+      "learning_rate": 8.201219382016556e-06,
+      "loss": 0.6372,
+      "step": 1639
+    },
+    {
+      "epoch": 0.8746666666666667,
+      "grad_norm": 0.38303609001935035,
+      "learning_rate": 8.132820577225387e-06,
+      "loss": 0.5823,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8752,
+      "grad_norm": 0.4366958103642028,
+      "learning_rate": 8.064696101776358e-06,
+      "loss": 0.6372,
+      "step": 1641
+    },
+    {
+      "epoch": 0.8757333333333334,
+      "grad_norm": 0.42322691632058274,
+      "learning_rate": 7.996846159099557e-06,
+      "loss": 0.6362,
+      "step": 1642
+    },
+    {
+      "epoch": 0.8762666666666666,
+      "grad_norm": 0.3891173311203792,
+      "learning_rate": 7.929270951805178e-06,
+      "loss": 0.5992,
+      "step": 1643
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.37586740269463964,
+      "learning_rate": 7.861970681683051e-06,
+      "loss": 0.6811,
+      "step": 1644
+    },
+    {
+      "epoch": 0.8773333333333333,
+      "grad_norm": 0.3702660037262906,
+      "learning_rate": 7.794945549701993e-06,
+      "loss": 0.6075,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8778666666666667,
+      "grad_norm": 0.40831551285631984,
+      "learning_rate": 7.728195756009204e-06,
+      "loss": 0.639,
+      "step": 1646
+    },
+    {
+      "epoch": 0.8784,
+      "grad_norm": 0.42505489999592005,
+      "learning_rate": 7.661721499929753e-06,
+      "loss": 0.6602,
+      "step": 1647
+    },
+    {
+      "epoch": 0.8789333333333333,
+      "grad_norm": 0.4634256620993115,
+      "learning_rate": 7.595522979965819e-06,
+      "loss": 0.6652,
+      "step": 1648
+    },
+    {
+      "epoch": 0.8794666666666666,
+      "grad_norm": 0.3975136154435997,
+      "learning_rate": 7.529600393796232e-06,
+      "loss": 0.6182,
+      "step": 1649
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.35900290259848433,
+      "learning_rate": 7.463953938275858e-06,
+      "loss": 0.5408,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8805333333333333,
+      "grad_norm": 0.5238800168777887,
+      "learning_rate": 7.3985838094349444e-06,
+      "loss": 0.7043,
+      "step": 1651
+    },
+    {
+      "epoch": 0.8810666666666667,
+      "grad_norm": 0.41110094511302564,
+      "learning_rate": 7.333490202478666e-06,
+      "loss": 0.6375,
+      "step": 1652
+    },
+    {
+      "epoch": 0.8816,
+      "grad_norm": 0.41615166707814594,
+      "learning_rate": 7.2686733117863784e-06,
+      "loss": 0.6578,
+      "step": 1653
+    },
+    {
+      "epoch": 0.8821333333333333,
+      "grad_norm": 0.42650553868054925,
+      "learning_rate": 7.204133330911178e-06,
+      "loss": 0.6431,
+      "step": 1654
+    },
+    {
+      "epoch": 0.8826666666666667,
+      "grad_norm": 0.5233343583632561,
+      "learning_rate": 7.1398704525792e-06,
+      "loss": 0.7134,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.4165457177974901,
+      "learning_rate": 7.07588486868922e-06,
+      "loss": 0.6196,
+      "step": 1656
+    },
+    {
+      "epoch": 0.8837333333333334,
+      "grad_norm": 0.6116535667977607,
+      "learning_rate": 7.012176770311862e-06,
+      "loss": 0.6383,
+      "step": 1657
+    },
+    {
+      "epoch": 0.8842666666666666,
+      "grad_norm": 0.396977276556628,
+      "learning_rate": 6.948746347689183e-06,
+      "loss": 0.6391,
+      "step": 1658
+    },
+    {
+      "epoch": 0.8848,
+      "grad_norm": 0.37237274450227187,
+      "learning_rate": 6.8855937902340576e-06,
+      "loss": 0.5685,
+      "step": 1659
+    },
+    {
+      "epoch": 0.8853333333333333,
+      "grad_norm": 0.3730492247703517,
+      "learning_rate": 6.8227192865295995e-06,
+      "loss": 0.6381,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8858666666666667,
+      "grad_norm": 0.43171087207029185,
+      "learning_rate": 6.760123024328624e-06,
+      "loss": 0.6581,
+      "step": 1661
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3594653058558447,
+      "learning_rate": 6.6978051905530855e-06,
+      "loss": 0.5859,
+      "step": 1662
+    },
+    {
+      "epoch": 0.8869333333333334,
+      "grad_norm": 0.40031606713438245,
+      "learning_rate": 6.635765971293484e-06,
+      "loss": 0.6098,
+      "step": 1663
+    },
+    {
+      "epoch": 0.8874666666666666,
+      "grad_norm": 0.3577652012444116,
+      "learning_rate": 6.5740055518083375e-06,
+      "loss": 0.595,
+      "step": 1664
+    },
+    {
+      "epoch": 0.888,
+      "grad_norm": 0.33568210599329296,
+      "learning_rate": 6.512524116523633e-06,
+      "loss": 0.5656,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8885333333333333,
+      "grad_norm": 0.3757493259664873,
+      "learning_rate": 6.451321849032288e-06,
+      "loss": 0.6518,
+      "step": 1666
+    },
+    {
+      "epoch": 0.8890666666666667,
+      "grad_norm": 0.37183669635879696,
+      "learning_rate": 6.390398932093555e-06,
+      "loss": 0.5944,
+      "step": 1667
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3837060792491067,
+      "learning_rate": 6.329755547632499e-06,
+      "loss": 0.579,
+      "step": 1668
+    },
+    {
+      "epoch": 0.8901333333333333,
+      "grad_norm": 0.3869512633232343,
+      "learning_rate": 6.269391876739495e-06,
+      "loss": 0.6089,
+      "step": 1669
+    },
+    {
+      "epoch": 0.8906666666666667,
+      "grad_norm": 0.38558581631332584,
+      "learning_rate": 6.209308099669597e-06,
+      "loss": 0.5621,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8912,
+      "grad_norm": 0.3620135333664394,
+      "learning_rate": 6.149504395842087e-06,
+      "loss": 0.6053,
+      "step": 1671
+    },
+    {
+      "epoch": 0.8917333333333334,
+      "grad_norm": 0.399758881219958,
+      "learning_rate": 6.089980943839924e-06,
+      "loss": 0.5723,
+      "step": 1672
+    },
+    {
+      "epoch": 0.8922666666666667,
+      "grad_norm": 0.4116930783863641,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.7012,
+      "step": 1673
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3676953632029055,
+      "learning_rate": 5.971775505458444e-06,
+      "loss": 0.6175,
+      "step": 1674
+    },
+    {
+      "epoch": 0.8933333333333333,
+      "grad_norm": 0.38209507850251573,
+      "learning_rate": 5.913093872058528e-06,
+      "loss": 0.551,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8938666666666667,
+      "grad_norm": 0.39917237518155163,
+      "learning_rate": 5.854693196441641e-06,
+      "loss": 0.5764,
+      "step": 1676
+    },
+    {
+      "epoch": 0.8944,
+      "grad_norm": 0.3774143488798994,
+      "learning_rate": 5.7965736530010916e-06,
+      "loss": 0.6169,
+      "step": 1677
+    },
+    {
+      "epoch": 0.8949333333333334,
+      "grad_norm": 0.4326734293746316,
+      "learning_rate": 5.738735415290642e-06,
+      "loss": 0.5727,
+      "step": 1678
+    },
+    {
+      "epoch": 0.8954666666666666,
+      "grad_norm": 0.46639365334044075,
+      "learning_rate": 5.681178656024055e-06,
+      "loss": 0.7644,
+      "step": 1679
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.41650925093634616,
+      "learning_rate": 5.623903547074549e-06,
+      "loss": 0.7055,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8965333333333333,
+      "grad_norm": 0.425413643257288,
+      "learning_rate": 5.566910259474289e-06,
+      "loss": 0.6392,
+      "step": 1681
+    },
+    {
+      "epoch": 0.8970666666666667,
+      "grad_norm": 0.39278957561917244,
+      "learning_rate": 5.510198963413881e-06,
+      "loss": 0.6099,
+      "step": 1682
+    },
+    {
+      "epoch": 0.8976,
+      "grad_norm": 0.390744500709825,
+      "learning_rate": 5.453769828241872e-06,
+      "loss": 0.6568,
+      "step": 1683
+    },
+    {
+      "epoch": 0.8981333333333333,
+      "grad_norm": 0.4073043144813503,
+      "learning_rate": 5.397623022464226e-06,
+      "loss": 0.594,
+      "step": 1684
+    },
+    {
+      "epoch": 0.8986666666666666,
+      "grad_norm": 0.4154341662053565,
+      "learning_rate": 5.341758713743828e-06,
+      "loss": 0.6985,
+      "step": 1685
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.43612098721955833,
+      "learning_rate": 5.286177068899989e-06,
+      "loss": 0.6879,
+      "step": 1686
+    },
+    {
+      "epoch": 0.8997333333333334,
+      "grad_norm": 0.4580436108523151,
+      "learning_rate": 5.230878253907912e-06,
+      "loss": 0.6894,
+      "step": 1687
+    },
+    {
+      "epoch": 0.9002666666666667,
+      "grad_norm": 0.45484764609805073,
+      "learning_rate": 5.175862433898282e-06,
+      "loss": 0.7056,
+      "step": 1688
+    },
+    {
+      "epoch": 0.9008,
+      "grad_norm": 0.4056923813691809,
+      "learning_rate": 5.121129773156663e-06,
+      "loss": 0.5872,
+      "step": 1689
+    },
+    {
+      "epoch": 0.9013333333333333,
+      "grad_norm": 0.3959728264489669,
+      "learning_rate": 5.066680435123106e-06,
+      "loss": 0.6153,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9018666666666667,
+      "grad_norm": 0.38389124048407347,
+      "learning_rate": 5.012514582391592e-06,
+      "loss": 0.6166,
+      "step": 1691
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.37726113265552336,
+      "learning_rate": 4.95863237670956e-06,
+      "loss": 0.676,
+      "step": 1692
+    },
+    {
+      "epoch": 0.9029333333333334,
+      "grad_norm": 0.39381983874735715,
+      "learning_rate": 4.905033978977491e-06,
+      "loss": 0.6252,
+      "step": 1693
+    },
+    {
+      "epoch": 0.9034666666666666,
+      "grad_norm": 0.38630816177028143,
+      "learning_rate": 4.851719549248301e-06,
+      "loss": 0.6824,
+      "step": 1694
+    },
+    {
+      "epoch": 0.904,
+      "grad_norm": 0.3938486563850244,
+      "learning_rate": 4.798689246727006e-06,
+      "loss": 0.5795,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9045333333333333,
+      "grad_norm": 0.3935846533547814,
+      "learning_rate": 4.745943229770122e-06,
+      "loss": 0.6273,
+      "step": 1696
+    },
+    {
+      "epoch": 0.9050666666666667,
+      "grad_norm": 0.4016983236901665,
+      "learning_rate": 4.693481655885257e-06,
+      "loss": 0.6784,
+      "step": 1697
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.42610143158704605,
+      "learning_rate": 4.641304681730641e-06,
+      "loss": 0.6469,
+      "step": 1698
+    },
+    {
+      "epoch": 0.9061333333333333,
+      "grad_norm": 0.37841507863339735,
+      "learning_rate": 4.58941246311464e-06,
+      "loss": 0.6772,
+      "step": 1699
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.43091005059953413,
+      "learning_rate": 4.537805154995278e-06,
+      "loss": 0.6265,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9072,
+      "grad_norm": 0.3862461493501147,
+      "learning_rate": 4.486482911479839e-06,
+      "loss": 0.5994,
+      "step": 1701
+    },
+    {
+      "epoch": 0.9077333333333333,
+      "grad_norm": 0.45623319790774136,
+      "learning_rate": 4.435445885824285e-06,
+      "loss": 0.7243,
+      "step": 1702
+    },
+    {
+      "epoch": 0.9082666666666667,
+      "grad_norm": 0.3914923161785018,
+      "learning_rate": 4.384694230432984e-06,
+      "loss": 0.6274,
+      "step": 1703
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.37402162465050454,
+      "learning_rate": 4.3342280968580285e-06,
+      "loss": 0.6032,
+      "step": 1704
+    },
+    {
+      "epoch": 0.9093333333333333,
+      "grad_norm": 0.4507907929467838,
+      "learning_rate": 4.2840476357989825e-06,
+      "loss": 0.6729,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9098666666666667,
+      "grad_norm": 0.3661986883957363,
+      "learning_rate": 4.2341529971023255e-06,
+      "loss": 0.5669,
+      "step": 1706
+    },
+    {
+      "epoch": 0.9104,
+      "grad_norm": 0.41388626248702587,
+      "learning_rate": 4.184544329761009e-06,
+      "loss": 0.6383,
+      "step": 1707
+    },
+    {
+      "epoch": 0.9109333333333334,
+      "grad_norm": 0.3663264719910165,
+      "learning_rate": 4.135221781914034e-06,
+      "loss": 0.6319,
+      "step": 1708
+    },
+    {
+      "epoch": 0.9114666666666666,
+      "grad_norm": 0.38342215848984346,
+      "learning_rate": 4.0861855008460405e-06,
+      "loss": 0.565,
+      "step": 1709
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4429325625906281,
+      "learning_rate": 4.037435632986786e-06,
+      "loss": 0.6849,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9125333333333333,
+      "grad_norm": 0.40666064686143744,
+      "learning_rate": 3.988972323910778e-06,
+      "loss": 0.6665,
+      "step": 1711
+    },
+    {
+      "epoch": 0.9130666666666667,
+      "grad_norm": 0.39052164502879655,
+      "learning_rate": 3.9407957183368095e-06,
+      "loss": 0.6324,
+      "step": 1712
+    },
+    {
+      "epoch": 0.9136,
+      "grad_norm": 0.41727403909031907,
+      "learning_rate": 3.892905960127546e-06,
+      "loss": 0.6476,
+      "step": 1713
+    },
+    {
+      "epoch": 0.9141333333333334,
+      "grad_norm": 0.425226205526703,
+      "learning_rate": 3.845303192289074e-06,
+      "loss": 0.5984,
+      "step": 1714
+    },
+    {
+      "epoch": 0.9146666666666666,
+      "grad_norm": 0.42989062615720675,
+      "learning_rate": 3.797987556970495e-06,
+      "loss": 0.6723,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.4636995124081845,
+      "learning_rate": 3.750959195463466e-06,
+      "loss": 0.7219,
+      "step": 1716
+    },
+    {
+      "epoch": 0.9157333333333333,
+      "grad_norm": 0.3605121368134305,
+      "learning_rate": 3.7042182482018075e-06,
+      "loss": 0.6043,
+      "step": 1717
+    },
+    {
+      "epoch": 0.9162666666666667,
+      "grad_norm": 0.38272748533172646,
+      "learning_rate": 3.6577648547611033e-06,
+      "loss": 0.6545,
+      "step": 1718
+    },
+    {
+      "epoch": 0.9168,
+      "grad_norm": 0.3522011131041464,
+      "learning_rate": 3.611599153858214e-06,
+      "loss": 0.5692,
+      "step": 1719
+    },
+    {
+      "epoch": 0.9173333333333333,
+      "grad_norm": 0.49209849132812505,
+      "learning_rate": 3.565721283350931e-06,
+      "loss": 0.6871,
+      "step": 1720
+    },
+    {
+      "epoch": 0.9178666666666667,
+      "grad_norm": 0.47373614480469567,
+      "learning_rate": 3.5201313802375456e-06,
+      "loss": 0.6194,
+      "step": 1721
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.37572241258048084,
+      "learning_rate": 3.4748295806564356e-06,
+      "loss": 0.6331,
+      "step": 1722
+    },
+    {
+      "epoch": 0.9189333333333334,
+      "grad_norm": 0.41192517155161484,
+      "learning_rate": 3.4298160198856568e-06,
+      "loss": 0.6621,
+      "step": 1723
+    },
+    {
+      "epoch": 0.9194666666666667,
+      "grad_norm": 0.34315006199865633,
+      "learning_rate": 3.3850908323424967e-06,
+      "loss": 0.5141,
+      "step": 1724
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.38284360446064186,
+      "learning_rate": 3.3406541515832003e-06,
+      "loss": 0.693,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9205333333333333,
+      "grad_norm": 0.42746887049605614,
+      "learning_rate": 3.296506110302422e-06,
+      "loss": 0.6035,
+      "step": 1726
+    },
+    {
+      "epoch": 0.9210666666666667,
+      "grad_norm": 0.3717543292629806,
+      "learning_rate": 3.252646840332918e-06,
+      "loss": 0.5495,
+      "step": 1727
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.4416172856217034,
+      "learning_rate": 3.209076472645112e-06,
+      "loss": 0.6688,
+      "step": 1728
+    },
+    {
+      "epoch": 0.9221333333333334,
+      "grad_norm": 0.40412629092513996,
+      "learning_rate": 3.1657951373467497e-06,
+      "loss": 0.5929,
+      "step": 1729
+    },
+    {
+      "epoch": 0.9226666666666666,
+      "grad_norm": 0.548109727386344,
+      "learning_rate": 3.1228029636824475e-06,
+      "loss": 0.6094,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9232,
+      "grad_norm": 0.37109535691540146,
+      "learning_rate": 3.0801000800333877e-06,
+      "loss": 0.6617,
+      "step": 1731
+    },
+    {
+      "epoch": 0.9237333333333333,
+      "grad_norm": 0.38958918145454857,
+      "learning_rate": 3.037686613916857e-06,
+      "loss": 0.6604,
+      "step": 1732
+    },
+    {
+      "epoch": 0.9242666666666667,
+      "grad_norm": 0.324289583165879,
+      "learning_rate": 2.995562691985898e-06,
+      "loss": 0.5615,
+      "step": 1733
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.33730563151936555,
+      "learning_rate": 2.9537284400289355e-06,
+      "loss": 0.543,
+      "step": 1734
+    },
+    {
+      "epoch": 0.9253333333333333,
+      "grad_norm": 0.3420527854088097,
+      "learning_rate": 2.912183982969385e-06,
+      "loss": 0.5682,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9258666666666666,
+      "grad_norm": 0.6141191055395692,
+      "learning_rate": 2.8709294448653225e-06,
+      "loss": 0.7235,
+      "step": 1736
+    },
+    {
+      "epoch": 0.9264,
+      "grad_norm": 0.4081644698765578,
+      "learning_rate": 2.8299649489090475e-06,
+      "loss": 0.5907,
+      "step": 1737
+    },
+    {
+      "epoch": 0.9269333333333334,
+      "grad_norm": 0.4304989232114822,
+      "learning_rate": 2.789290617426765e-06,
+      "loss": 0.6644,
+      "step": 1738
+    },
+    {
+      "epoch": 0.9274666666666667,
+      "grad_norm": 0.44572336968832404,
+      "learning_rate": 2.748906571878207e-06,
+      "loss": 0.7001,
+      "step": 1739
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4304277308592198,
+      "learning_rate": 2.708812932856253e-06,
+      "loss": 0.5982,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9285333333333333,
+      "grad_norm": 0.42531278167401115,
+      "learning_rate": 2.6690098200866098e-06,
+      "loss": 0.664,
+      "step": 1741
+    },
+    {
+      "epoch": 0.9290666666666667,
+      "grad_norm": 0.4149536644687968,
+      "learning_rate": 2.6294973524274125e-06,
+      "loss": 0.6007,
+      "step": 1742
+    },
+    {
+      "epoch": 0.9296,
+      "grad_norm": 0.43253782934120905,
+      "learning_rate": 2.590275647868867e-06,
+      "loss": 0.6781,
+      "step": 1743
+    },
+    {
+      "epoch": 0.9301333333333334,
+      "grad_norm": 0.35974443341526036,
+      "learning_rate": 2.551344823532964e-06,
+      "loss": 0.594,
+      "step": 1744
+    },
+    {
+      "epoch": 0.9306666666666666,
+      "grad_norm": 0.41653066509061626,
+      "learning_rate": 2.5127049956730207e-06,
+      "loss": 0.6348,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.4413389196480324,
+      "learning_rate": 2.4743562796734622e-06,
+      "loss": 0.6193,
+      "step": 1746
+    },
+    {
+      "epoch": 0.9317333333333333,
+      "grad_norm": 0.3688000579881498,
+      "learning_rate": 2.436298790049363e-06,
+      "loss": 0.5977,
+      "step": 1747
+    },
+    {
+      "epoch": 0.9322666666666667,
+      "grad_norm": 0.4199307911083171,
+      "learning_rate": 2.3985326404461604e-06,
+      "loss": 0.6777,
+      "step": 1748
+    },
+    {
+      "epoch": 0.9328,
+      "grad_norm": 0.3970606163052576,
+      "learning_rate": 2.3610579436393e-06,
+      "loss": 0.6222,
+      "step": 1749
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.4174698629182832,
+      "learning_rate": 2.3238748115339324e-06,
+      "loss": 0.6815,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9338666666666666,
+      "grad_norm": 0.4036921769098127,
+      "learning_rate": 2.286983355164529e-06,
+      "loss": 0.6032,
+      "step": 1751
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.4285008185444954,
+      "learning_rate": 2.250383684694579e-06,
+      "loss": 0.6643,
+      "step": 1752
+    },
+    {
+      "epoch": 0.9349333333333333,
+      "grad_norm": 0.36953939602361985,
+      "learning_rate": 2.2140759094162467e-06,
+      "loss": 0.6107,
+      "step": 1753
+    },
+    {
+      "epoch": 0.9354666666666667,
+      "grad_norm": 0.4132918798454661,
+      "learning_rate": 2.178060137750071e-06,
+      "loss": 0.6374,
+      "step": 1754
+    },
+    {
+      "epoch": 0.936,
+      "grad_norm": 0.39093970713003195,
+      "learning_rate": 2.1423364772445887e-06,
+      "loss": 0.5885,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9365333333333333,
+      "grad_norm": 0.45277043620307783,
+      "learning_rate": 2.106905034576112e-06,
+      "loss": 0.6468,
+      "step": 1756
+    },
+    {
+      "epoch": 0.9370666666666667,
+      "grad_norm": 0.38838921651357705,
+      "learning_rate": 2.0717659155482738e-06,
+      "loss": 0.6715,
+      "step": 1757
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.35170101651760033,
+      "learning_rate": 2.036919225091827e-06,
+      "loss": 0.6085,
+      "step": 1758
+    },
+    {
+      "epoch": 0.9381333333333334,
+      "grad_norm": 0.40977188438436474,
+      "learning_rate": 2.002365067264289e-06,
+      "loss": 0.6122,
+      "step": 1759
+    },
+    {
+      "epoch": 0.9386666666666666,
+      "grad_norm": 0.4999692476876301,
+      "learning_rate": 1.968103545249611e-06,
+      "loss": 0.7785,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9392,
+      "grad_norm": 0.44579608014517047,
+      "learning_rate": 1.9341347613579087e-06,
+      "loss": 0.6574,
+      "step": 1761
+    },
+    {
+      "epoch": 0.9397333333333333,
+      "grad_norm": 0.38393065737591503,
+      "learning_rate": 1.900458817025097e-06,
+      "loss": 0.6132,
+      "step": 1762
+    },
+    {
+      "epoch": 0.9402666666666667,
+      "grad_norm": 0.39650180104626154,
+      "learning_rate": 1.8670758128126909e-06,
+      "loss": 0.6378,
+      "step": 1763
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.37566035110459617,
+      "learning_rate": 1.8339858484073935e-06,
+      "loss": 0.5903,
+      "step": 1764
+    },
+    {
+      "epoch": 0.9413333333333334,
+      "grad_norm": 0.3779202134176613,
+      "learning_rate": 1.8011890226208527e-06,
+      "loss": 0.6376,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9418666666666666,
+      "grad_norm": 0.45430837631109716,
+      "learning_rate": 1.7686854333893833e-06,
+      "loss": 0.6216,
+      "step": 1766
+    },
+    {
+      "epoch": 0.9424,
+      "grad_norm": 0.380466844962673,
+      "learning_rate": 1.7364751777736332e-06,
+      "loss": 0.6337,
+      "step": 1767
+    },
+    {
+      "epoch": 0.9429333333333333,
+      "grad_norm": 0.45972781704796434,
+      "learning_rate": 1.7045583519583074e-06,
+      "loss": 0.6983,
+      "step": 1768
+    },
+    {
+      "epoch": 0.9434666666666667,
+      "grad_norm": 0.36117014737348896,
+      "learning_rate": 1.6729350512519005e-06,
+      "loss": 0.6296,
+      "step": 1769
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3938199548858364,
+      "learning_rate": 1.6416053700863964e-06,
+      "loss": 0.6015,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9445333333333333,
+      "grad_norm": 0.37137212354045274,
+      "learning_rate": 1.6105694020169593e-06,
+      "loss": 0.5782,
+      "step": 1771
+    },
+    {
+      "epoch": 0.9450666666666667,
+      "grad_norm": 0.3882336811470042,
+      "learning_rate": 1.5798272397217095e-06,
+      "loss": 0.5943,
+      "step": 1772
+    },
+    {
+      "epoch": 0.9456,
+      "grad_norm": 0.4376675474062556,
+      "learning_rate": 1.5493789750014031e-06,
+      "loss": 0.6402,
+      "step": 1773
+    },
+    {
+      "epoch": 0.9461333333333334,
+      "grad_norm": 0.3965225862235942,
+      "learning_rate": 1.5192246987791981e-06,
+      "loss": 0.6234,
+      "step": 1774
+    },
+    {
+      "epoch": 0.9466666666666667,
+      "grad_norm": 0.38088896296122987,
+      "learning_rate": 1.489364501100332e-06,
+      "loss": 0.6113,
+      "step": 1775
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.40567672292834395,
+      "learning_rate": 1.459798471131868e-06,
+      "loss": 0.579,
+      "step": 1776
+    },
+    {
+      "epoch": 0.9477333333333333,
+      "grad_norm": 0.39066870639184614,
+      "learning_rate": 1.430526697162482e-06,
+      "loss": 0.6062,
+      "step": 1777
+    },
+    {
+      "epoch": 0.9482666666666667,
+      "grad_norm": 0.43513431531760194,
+      "learning_rate": 1.4015492666021312e-06,
+      "loss": 0.6324,
+      "step": 1778
+    },
+    {
+      "epoch": 0.9488,
+      "grad_norm": 0.47798973632227304,
+      "learning_rate": 1.3728662659818204e-06,
+      "loss": 0.7004,
+      "step": 1779
+    },
+    {
+      "epoch": 0.9493333333333334,
+      "grad_norm": 0.3437629351738168,
+      "learning_rate": 1.344477780953346e-06,
+      "loss": 0.5798,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9498666666666666,
+      "grad_norm": 0.5310871525203585,
+      "learning_rate": 1.3163838962890195e-06,
+      "loss": 0.7382,
+      "step": 1781
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3380724272652971,
+      "learning_rate": 1.2885846958814673e-06,
+      "loss": 0.5703,
+      "step": 1782
+    },
+    {
+      "epoch": 0.9509333333333333,
+      "grad_norm": 0.3692754455038017,
+      "learning_rate": 1.261080262743297e-06,
+      "loss": 0.6134,
+      "step": 1783
+    },
+    {
+      "epoch": 0.9514666666666667,
+      "grad_norm": 0.46281748086123625,
+      "learning_rate": 1.2338706790069431e-06,
+      "loss": 0.6626,
+      "step": 1784
+    },
+    {
+      "epoch": 0.952,
+      "grad_norm": 0.4344435512092572,
+      "learning_rate": 1.2069560259243328e-06,
+      "loss": 0.6054,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9525333333333333,
+      "grad_norm": 0.4376945848340338,
+      "learning_rate": 1.1803363838667092e-06,
+      "loss": 0.6312,
+      "step": 1786
+    },
+    {
+      "epoch": 0.9530666666666666,
+      "grad_norm": 0.38851437739746575,
+      "learning_rate": 1.1540118323243865e-06,
+      "loss": 0.6102,
+      "step": 1787
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3791549299691001,
+      "learning_rate": 1.1279824499064396e-06,
+      "loss": 0.5704,
+      "step": 1788
+    },
+    {
+      "epoch": 0.9541333333333334,
+      "grad_norm": 0.34637671330326325,
+      "learning_rate": 1.1022483143405705e-06,
+      "loss": 0.569,
+      "step": 1789
+    },
+    {
+      "epoch": 0.9546666666666667,
+      "grad_norm": 0.397350750997209,
+      "learning_rate": 1.076809502472831e-06,
+      "loss": 0.6319,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9552,
+      "grad_norm": 0.45244614874524447,
+      "learning_rate": 1.0516660902673448e-06,
+      "loss": 0.6244,
+      "step": 1791
+    },
+    {
+      "epoch": 0.9557333333333333,
+      "grad_norm": 0.4220643235107355,
+      "learning_rate": 1.0268181528061749e-06,
+      "loss": 0.6361,
+      "step": 1792
+    },
+    {
+      "epoch": 0.9562666666666667,
+      "grad_norm": 0.398898706807609,
+      "learning_rate": 1.0022657642890231e-06,
+      "loss": 0.6188,
+      "step": 1793
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3431482155645094,
+      "learning_rate": 9.780089980330642e-07,
+      "loss": 0.5479,
+      "step": 1794
+    },
+    {
+      "epoch": 0.9573333333333334,
+      "grad_norm": 0.4041433878134855,
+      "learning_rate": 9.540479264726676e-07,
+      "loss": 0.6493,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9578666666666666,
+      "grad_norm": 0.37743795314295875,
+      "learning_rate": 9.303826211592315e-07,
+      "loss": 0.6491,
+      "step": 1796
+    },
+    {
+      "epoch": 0.9584,
+      "grad_norm": 0.4370842130641423,
+      "learning_rate": 9.070131527609604e-07,
+      "loss": 0.584,
+      "step": 1797
+    },
+    {
+      "epoch": 0.9589333333333333,
+      "grad_norm": 0.40030636657203433,
+      "learning_rate": 8.839395910626213e-07,
+      "loss": 0.6349,
+      "step": 1798
+    },
+    {
+      "epoch": 0.9594666666666667,
+      "grad_norm": 0.34671994474872103,
+      "learning_rate": 8.611620049653879e-07,
+      "loss": 0.5521,
+      "step": 1799
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.4450820840061595,
+      "learning_rate": 8.386804624865851e-07,
+      "loss": 0.6631,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9605333333333334,
+      "grad_norm": 0.39007069670154804,
+      "learning_rate": 8.16495030759501e-07,
+      "loss": 0.6345,
+      "step": 1801
+    },
+    {
+      "epoch": 0.9610666666666666,
+      "grad_norm": 0.4845383265828704,
+      "learning_rate": 7.946057760332193e-07,
+      "loss": 0.6056,
+      "step": 1802
+    },
+    {
+      "epoch": 0.9616,
+      "grad_norm": 0.3805334869970108,
+      "learning_rate": 7.730127636723539e-07,
+      "loss": 0.5943,
+      "step": 1803
+    },
+    {
+      "epoch": 0.9621333333333333,
+      "grad_norm": 0.4392854524912693,
+      "learning_rate": 7.517160581569372e-07,
+      "loss": 0.6876,
+      "step": 1804
+    },
+    {
+      "epoch": 0.9626666666666667,
+      "grad_norm": 0.46948718919118265,
+      "learning_rate": 7.307157230821426e-07,
+      "loss": 0.6842,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.42666707394312775,
+      "learning_rate": 7.100118211581852e-07,
+      "loss": 0.7057,
+      "step": 1806
+    },
+    {
+      "epoch": 0.9637333333333333,
+      "grad_norm": 0.4691367931022991,
+      "learning_rate": 6.896044142100433e-07,
+      "loss": 0.6714,
+      "step": 1807
+    },
+    {
+      "epoch": 0.9642666666666667,
+      "grad_norm": 0.37921786631867405,
+      "learning_rate": 6.694935631773258e-07,
+      "loss": 0.6112,
+      "step": 1808
+    },
+    {
+      "epoch": 0.9648,
+      "grad_norm": 0.46148125725126665,
+      "learning_rate": 6.496793281141056e-07,
+      "loss": 0.7639,
+      "step": 1809
+    },
+    {
+      "epoch": 0.9653333333333334,
+      "grad_norm": 0.4844058606638755,
+      "learning_rate": 6.301617681886863e-07,
+      "loss": 0.7221,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9658666666666667,
+      "grad_norm": 0.406922952308384,
+      "learning_rate": 6.109409416834688e-07,
+      "loss": 0.6234,
+      "step": 1811
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.38163981720304063,
+      "learning_rate": 5.920169059947411e-07,
+      "loss": 0.6152,
+      "step": 1812
+    },
+    {
+      "epoch": 0.9669333333333333,
+      "grad_norm": 0.37315891106541976,
+      "learning_rate": 5.733897176325665e-07,
+      "loss": 0.6172,
+      "step": 1813
+    },
+    {
+      "epoch": 0.9674666666666667,
+      "grad_norm": 0.3889085407281807,
+      "learning_rate": 5.550594322205504e-07,
+      "loss": 0.6271,
+      "step": 1814
+    },
+    {
+      "epoch": 0.968,
+      "grad_norm": 0.40090834399965586,
+      "learning_rate": 5.370261044956971e-07,
+      "loss": 0.6137,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9685333333333334,
+      "grad_norm": 0.40461055305320026,
+      "learning_rate": 5.192897883082747e-07,
+      "loss": 0.6256,
+      "step": 1816
+    },
+    {
+      "epoch": 0.9690666666666666,
+      "grad_norm": 0.38194267251555764,
+      "learning_rate": 5.018505366216175e-07,
+      "loss": 0.6062,
+      "step": 1817
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.43002286383319605,
+      "learning_rate": 4.847084015119574e-07,
+      "loss": 0.6104,
+      "step": 1818
+    },
+    {
+      "epoch": 0.9701333333333333,
+      "grad_norm": 0.5208208467308275,
+      "learning_rate": 4.678634341683252e-07,
+      "loss": 0.672,
+      "step": 1819
+    },
+    {
+      "epoch": 0.9706666666666667,
+      "grad_norm": 0.42246477229007084,
+      "learning_rate": 4.5131568489236166e-07,
+      "loss": 0.6144,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9712,
+      "grad_norm": 0.42363089684681204,
+      "learning_rate": 4.3506520309813947e-07,
+      "loss": 0.6333,
+      "step": 1821
+    },
+    {
+      "epoch": 0.9717333333333333,
+      "grad_norm": 0.4419638503758844,
+      "learning_rate": 4.191120373120749e-07,
+      "loss": 0.6045,
+      "step": 1822
+    },
+    {
+      "epoch": 0.9722666666666666,
+      "grad_norm": 0.3869966116104389,
+      "learning_rate": 4.034562351727389e-07,
+      "loss": 0.6064,
+      "step": 1823
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.39819979538946243,
+      "learning_rate": 3.8809784343072366e-07,
+      "loss": 0.6683,
+      "step": 1824
+    },
+    {
+      "epoch": 0.9733333333333334,
+      "grad_norm": 0.38011596539787706,
+      "learning_rate": 3.73036907948543e-07,
+      "loss": 0.5736,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9738666666666667,
+      "grad_norm": 0.3746452885852023,
+      "learning_rate": 3.582734737004101e-07,
+      "loss": 0.61,
+      "step": 1826
+    },
+    {
+      "epoch": 0.9744,
+      "grad_norm": 0.47900250653324616,
+      "learning_rate": 3.4380758477219333e-07,
+      "loss": 0.6865,
+      "step": 1827
+    },
+    {
+      "epoch": 0.9749333333333333,
+      "grad_norm": 0.3525264619604992,
+      "learning_rate": 3.296392843612273e-07,
+      "loss": 0.5629,
+      "step": 1828
+    },
+    {
+      "epoch": 0.9754666666666667,
+      "grad_norm": 0.4955924167435773,
+      "learning_rate": 3.1576861477621287e-07,
+      "loss": 0.7149,
+      "step": 1829
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4184355019656372,
+      "learning_rate": 3.0219561743707326e-07,
+      "loss": 0.6345,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9765333333333334,
+      "grad_norm": 0.3938800613710465,
+      "learning_rate": 2.889203328748424e-07,
+      "loss": 0.6386,
+      "step": 1831
+    },
+    {
+      "epoch": 0.9770666666666666,
+      "grad_norm": 0.3982472188768771,
+      "learning_rate": 2.759428007315212e-07,
+      "loss": 0.6034,
+      "step": 1832
+    },
+    {
+      "epoch": 0.9776,
+      "grad_norm": 0.37240093380425443,
+      "learning_rate": 2.6326305976001055e-07,
+      "loss": 0.6225,
+      "step": 1833
+    },
+    {
+      "epoch": 0.9781333333333333,
+      "grad_norm": 0.4214353885560392,
+      "learning_rate": 2.5088114782392257e-07,
+      "loss": 0.6532,
+      "step": 1834
+    },
+    {
+      "epoch": 0.9786666666666667,
+      "grad_norm": 0.41423386651278216,
+      "learning_rate": 2.3879710189753656e-07,
+      "loss": 0.6109,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.6273923580496839,
+      "learning_rate": 2.2701095806565432e-07,
+      "loss": 0.6919,
+      "step": 1836
+    },
+    {
+      "epoch": 0.9797333333333333,
+      "grad_norm": 0.41093764346339123,
+      "learning_rate": 2.15522751523467e-07,
+      "loss": 0.6575,
+      "step": 1837
+    },
+    {
+      "epoch": 0.9802666666666666,
+      "grad_norm": 0.4177778256201117,
+      "learning_rate": 2.0433251657653308e-07,
+      "loss": 0.6339,
+      "step": 1838
+    },
+    {
+      "epoch": 0.9808,
+      "grad_norm": 0.4095187632595675,
+      "learning_rate": 1.9344028664056713e-07,
+      "loss": 0.589,
+      "step": 1839
+    },
+    {
+      "epoch": 0.9813333333333333,
+      "grad_norm": 0.40917192662630975,
+      "learning_rate": 1.8284609424142895e-07,
+      "loss": 0.5995,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9818666666666667,
+      "grad_norm": 0.36597627394304755,
+      "learning_rate": 1.7254997101500137e-07,
+      "loss": 0.6336,
+      "step": 1841
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3943987511881223,
+      "learning_rate": 1.6255194770704586e-07,
+      "loss": 0.6231,
+      "step": 1842
+    },
+    {
+      "epoch": 0.9829333333333333,
+      "grad_norm": 0.41345739895333505,
+      "learning_rate": 1.5285205417319149e-07,
+      "loss": 0.6544,
+      "step": 1843
+    },
+    {
+      "epoch": 0.9834666666666667,
+      "grad_norm": 0.4024778477907338,
+      "learning_rate": 1.4345031937879062e-07,
+      "loss": 0.6262,
+      "step": 1844
+    },
+    {
+      "epoch": 0.984,
+      "grad_norm": 0.47877298730134144,
+      "learning_rate": 1.3434677139885222e-07,
+      "loss": 0.6805,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9845333333333334,
+      "grad_norm": 0.4232211837735616,
+      "learning_rate": 1.255414374179531e-07,
+      "loss": 0.6415,
+      "step": 1846
+    },
+    {
+      "epoch": 0.9850666666666666,
+      "grad_norm": 0.3626033796777617,
+      "learning_rate": 1.170343437301491e-07,
+      "loss": 0.5324,
+      "step": 1847
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.4424864683447427,
+      "learning_rate": 1.0882551573891953e-07,
+      "loss": 0.6041,
+      "step": 1848
+    },
+    {
+      "epoch": 0.9861333333333333,
+      "grad_norm": 0.3991040889335519,
+      "learning_rate": 1.0091497795706728e-07,
+      "loss": 0.6313,
+      "step": 1849
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.3712157586467656,
+      "learning_rate": 9.330275400666332e-08,
+      "loss": 0.5788,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9872,
+      "grad_norm": 0.39765646907224256,
+      "learning_rate": 8.598886661895788e-08,
+      "loss": 0.6132,
+      "step": 1851
+    },
+    {
+      "epoch": 0.9877333333333334,
+      "grad_norm": 0.45976109755188366,
+      "learning_rate": 7.8973337634336e-08,
+      "loss": 0.6972,
+      "step": 1852
+    },
+    {
+      "epoch": 0.9882666666666666,
+      "grad_norm": 0.3958638220607664,
+      "learning_rate": 7.225618800222877e-08,
+      "loss": 0.6145,
+      "step": 1853
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.46242520338791676,
+      "learning_rate": 6.583743778106887e-08,
+      "loss": 0.6866,
+      "step": 1854
+    },
+    {
+      "epoch": 0.9893333333333333,
+      "grad_norm": 0.41695337344509853,
+      "learning_rate": 5.971710613821291e-08,
+      "loss": 0.6614,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9898666666666667,
+      "grad_norm": 0.4227237033985834,
+      "learning_rate": 5.389521134989695e-08,
+      "loss": 0.6806,
+      "step": 1856
+    },
+    {
+      "epoch": 0.9904,
+      "grad_norm": 0.4829258985045428,
+      "learning_rate": 4.837177080119215e-08,
+      "loss": 0.6839,
+      "step": 1857
+    },
+    {
+      "epoch": 0.9909333333333333,
+      "grad_norm": 0.4294697874822055,
+      "learning_rate": 4.314680098592705e-08,
+      "loss": 0.661,
+      "step": 1858
+    },
+    {
+      "epoch": 0.9914666666666667,
+      "grad_norm": 0.3901223575904081,
+      "learning_rate": 3.8220317506654226e-08,
+      "loss": 0.6105,
+      "step": 1859
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.40648596688328853,
+      "learning_rate": 3.359233507459481e-08,
+      "loss": 0.6426,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9925333333333334,
+      "grad_norm": 0.3686515189571681,
+      "learning_rate": 2.9262867509605163e-08,
+      "loss": 0.5758,
+      "step": 1861
+    },
+    {
+      "epoch": 0.9930666666666667,
+      "grad_norm": 0.6166304982855115,
+      "learning_rate": 2.5231927740154704e-08,
+      "loss": 0.6289,
+      "step": 1862
+    },
+    {
+      "epoch": 0.9936,
+      "grad_norm": 0.4229002483853549,
+      "learning_rate": 2.1499527803214846e-08,
+      "loss": 0.6452,
+      "step": 1863
+    },
+    {
+      "epoch": 0.9941333333333333,
+      "grad_norm": 0.3963059097706496,
+      "learning_rate": 1.8065678844314538e-08,
+      "loss": 0.6044,
+      "step": 1864
+    },
+    {
+      "epoch": 0.9946666666666667,
+      "grad_norm": 0.3920565762890434,
+      "learning_rate": 1.4930391117451426e-08,
+      "loss": 0.6373,
+      "step": 1865
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4401043721301902,
+      "learning_rate": 1.209367398504746e-08,
+      "loss": 0.6717,
+      "step": 1866
+    },
+    {
+      "epoch": 0.9957333333333334,
+      "grad_norm": 0.3856518849580394,
+      "learning_rate": 9.555535917993297e-09,
+      "loss": 0.5939,
+      "step": 1867
+    },
+    {
+      "epoch": 0.9962666666666666,
+      "grad_norm": 0.4074397128158654,
+      "learning_rate": 7.315984495548378e-09,
+      "loss": 0.6534,
+      "step": 1868
+    },
+    {
+      "epoch": 0.9968,
+      "grad_norm": 0.3903679497354316,
+      "learning_rate": 5.375026405352035e-09,
+      "loss": 0.6001,
+      "step": 1869
+    },
+    {
+      "epoch": 0.9973333333333333,
+      "grad_norm": 0.39556118144329355,
+      "learning_rate": 3.732667443390181e-09,
+      "loss": 0.6371,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9978666666666667,
+      "grad_norm": 0.3570559166896929,
+      "learning_rate": 2.388912514017516e-09,
+      "loss": 0.5798,
+      "step": 1871
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.43848927183099673,
+      "learning_rate": 1.3437656298687097e-09,
+      "loss": 0.6691,
+      "step": 1872
+    },
+    {
+      "epoch": 0.9989333333333333,
+      "grad_norm": 0.3887278108125185,
+      "learning_rate": 5.972299119250125e-10,
+      "loss": 0.6331,
+      "step": 1873
+    },
+    {
+      "epoch": 0.9994666666666666,
+      "grad_norm": 0.3720451458593415,
+      "learning_rate": 1.4930758944764479e-10,
+      "loss": 0.6607,
+      "step": 1874
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.42451716518267596,
+      "learning_rate": 0.0,
+      "loss": 0.6693,
+      "step": 1875
+    },
+    {
+      "epoch": 1.0,
+      "step": 1875,
+      "total_flos": 1690936583258112.0,
+      "train_loss": 0.7105037103970846,
+      "train_runtime": 29377.3841,
+      "train_samples_per_second": 1.021,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1690936583258112.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..30b813290ec4aa61eeeba82f3f4c5009897d19f4
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "down_proj",
+    "gate_proj",
+    "q_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0cae4f2aff9c5d14ef610908eed96b046e9c7aee
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d612e19a7470cc3e3a55d6951195e5cd1768350e9cfdaac960417de402b7a29a
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..be8efead8647943c5b4f26116482153b15c7eae5
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c8d63fc4cbc4f145985f596f2c6af645c7ee34970ed978ab9d4cedadeae8872
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f66f6f388913ee61c26ce37fb0c8b72c41ba9ae5
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.706372384637355,
+      "learning_rate": 2e-05,
+      "loss": 1.2116,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7132577302985311,
+      "learning_rate": 4e-05,
+      "loss": 1.2159,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7576851087474644,
+      "learning_rate": 6e-05,
+      "loss": 1.3365,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7251469370332176,
+      "learning_rate": 8e-05,
+      "loss": 1.2046,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7543909315754976,
+      "learning_rate": 0.0001,
+      "loss": 1.1315,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.7710513379038294,
+      "learning_rate": 0.00012,
+      "loss": 1.0763,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8700130881117148,
+      "learning_rate": 0.00014,
+      "loss": 1.0161,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7062873685696989,
+      "learning_rate": 0.00016,
+      "loss": 0.9766,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6276719913686867,
+      "learning_rate": 0.00018,
+      "loss": 0.8885,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.48268253814003653,
+      "learning_rate": 0.0002,
+      "loss": 0.9105,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.6046383023845995,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.9888,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5207115213993277,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9443,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4804285710737122,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.8564,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5545095568507734,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9725,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5511366684926723,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9165,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.526546815165093,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9242,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.5320719142330005,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8972,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.49262475460436955,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8967,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.46026944803598585,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.8632,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.47274515264742456,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8848,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.46436181337012583,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8669,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.41213506760026203,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.8417,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.40866649316812587,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8383,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5895646382759859,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8898,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4754218774744775,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8739,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.42615491071092587,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8955,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4973425567847533,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.9069,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.42284857437958,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8475,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.42719840251241153,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8369,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4361087817832279,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.8563,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3870466447188877,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8284,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4333186810342646,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.803,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5057877523198733,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.9339,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.39771554288914684,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8405,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.39423571382411393,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.7895,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4145519181964843,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8203,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.41487025580304077,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.849,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.41940164168771354,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.7895,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4820667535546615,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.8965,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4665067823750921,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.9094,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5265719547596305,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.8688,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.42260029299982316,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.781,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.46029663275021715,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.8734,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4032305072429021,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.7946,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4271057343845864,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.862,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.43514680222912955,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.9001,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3994355342843695,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.8053,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.3852189335285682,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8227,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.42218700595202424,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.9141,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4134418764962068,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8425,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.4365271441054017,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8687,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4198113121039888,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.7688,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.405081556296362,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.8238,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.3966738782637263,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8059,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4216526542500135,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.8174,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.40486760918324616,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.7881,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.372170659374174,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.813,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3623051107925519,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7411,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.4399934756341017,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7899,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.37612989675354286,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8054,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4009296252617688,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.7704,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4375959043493014,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.85,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4058068414533547,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8767,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.45318507931954866,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.9078,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.47499487962131176,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8995,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4112047746691982,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7646,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4599444309834608,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8395,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4153268210964602,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.8149,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4095997162714167,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.7965,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.37632430395084054,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7468,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 1.754470990635494,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.7746,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.43862893440675715,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8312,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3730853254794896,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.7838,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.43501234736399524,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8121,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4315395622099375,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8045,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.45243219214473296,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.7998,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.38158104127374903,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7273,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.39029421047072665,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.8051,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4448685187536034,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.821,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.40437396682659577,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.8099,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.39897213964130696,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.7215,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3830906985552,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.7608,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4293136276970088,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.8422,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.3990501135156007,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7954,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3969139416853821,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.7744,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.46193230448634365,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.8728,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.44007549015603314,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.8259,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.3869222724070434,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7533,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.41884480439098987,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.8667,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.43009413537086566,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8322,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.44501336696217714,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.8458,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4117672217268619,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8866,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3635525623502143,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.6836,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.44119747695330935,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.9032,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.3634053428885579,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7741,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.40074048786281563,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7567,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3710119961265731,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7438,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.43659390815943544,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8229,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.44089643664019057,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.8717,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4882390118058521,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8413,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.3892151483766946,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7386,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.45962800562606665,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8214,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.36844123484513785,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7762,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.393973862273754,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7748,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4068913984751016,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7453,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4198609640121107,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.8147,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.44435427505636294,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7834,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.4240365058162843,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.8124,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.43581310205188567,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.7962,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4067616506759109,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.7642,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4389221538921379,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.8329,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4292143785202815,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.8139,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.41683271568461383,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7463,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.41560250231469886,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8365,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4265526045880755,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.7318,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4125340931465266,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7438,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.41380660062293606,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7589,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4027084665611716,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7795,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4353731889281649,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.8598,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4740349413274777,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.9333,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.40992972281306467,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.8826,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.36347394284436363,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7407,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.404150662357857,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7737,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4278255533166568,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.8332,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.39436523824192266,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7932,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.40713229995212113,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.82,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.4310281065090191,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.7957,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3613968258945572,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7372,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3800431375399143,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7416,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3849926632349601,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.712,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.3827741004319946,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7685,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4184348035492567,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.8409,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.42952809627796723,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.782,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3885255772037007,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7796,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.4177816693102137,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.8373,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4252103124047141,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7054,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.39960643857322153,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7574,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4020654231178936,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.8206,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.42372134548147655,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.8005,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4045781402837269,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.822,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4435646118102826,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.8393,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3828792544162484,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.7898,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.38080024429440795,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7732,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4056355024308322,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7626,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.3882052147935702,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7952,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.45730059206674206,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8217,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3974489084438826,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7611,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3797558562347122,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.8435,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.358948721145034,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7286,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.3782538505103814,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7428,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.39267784239042214,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.787,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.42146096754745493,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.789,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.36020480638676666,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7271,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.37430245737142476,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.8252,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.38913045174749195,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.773,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.39960922244897257,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7155,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.39641619234048997,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7896,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.33652474109408925,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7171,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3941859869494838,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.7855,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.35618008718785404,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7337,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4131684190723905,
+      "learning_rate": 0.0001,
+      "loss": 0.7677,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4082194849724162,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7892,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.3989600006708164,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.8089,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.41631670995914843,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7854,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.402657543866696,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7787,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4107566999788711,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7596,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.41566952699593074,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7629,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.5342359883614851,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.9196,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3609183839419,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7353,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3565874591187865,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7193,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.3444424796083395,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.697,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.41961617081613556,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.8216,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.3927724673868508,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.8328,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3711041396342585,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7837,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.42945797533025126,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.8349,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.44056918352960456,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7856,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.34843951779969207,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.6821,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.3888506113430166,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.8158,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.37766985513771145,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7402,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4020630450815043,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7494,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.377081441672366,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.7844,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.40450245428415776,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8174,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.39493090616168064,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7985,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.41675679202293997,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.8159,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3931281130361976,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7521,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.6042502120663296,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.7507,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.47468697932857046,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.9119,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.38550459532969644,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7628,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.36495206218397463,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.7743,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4188243954824607,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.8073,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3699670401294648,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.727,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3801784175823091,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7446,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3973466051960255,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.8147,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3550433416895208,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7358,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3593347476207881,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.6282,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.6312465616911078,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.8337,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.4012296319663537,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7803,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.3549863053806089,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7183,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4047234928323414,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.8044,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3626160894555547,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.756,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.41467079082946373,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.8027,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4075587028092342,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7418,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3830976241692011,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7419,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.42143030433733797,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7973,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.35952121482881066,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.685,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3719557223049432,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7693,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4530552292723754,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.8558,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.404333362478301,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7737,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3602593506741611,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7277,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.43170347457251695,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.8079,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.38157460056469383,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7265,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3787607400303287,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7648,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.37812142777908264,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7608,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.4105230234147243,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7745,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3758126653197388,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.7584,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.36754219735668736,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7662,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3894194421895725,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.7093,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.357425964555511,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.683,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3534371862254715,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7069,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.40921866361886605,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7862,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3634675831446619,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7012,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.38769159694516386,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.8184,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3883328519290395,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.6963,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.4039785395114772,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.8165,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.36626014714342764,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.6868,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.36486252134168545,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7548,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.37920248619014446,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.75,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3927655551690742,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.7159,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.4171823450293806,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.7247,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.43052610498046595,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.763,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.37889611789663313,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.7687,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.382653293227921,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.7021,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.36566488831348215,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.7679,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3694354834019595,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7395,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.35908269164475143,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7337,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.39016941968285335,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.772,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.39819859786786,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7805,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.39185642378960983,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.6778,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.401622643081188,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.8092,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3920103368441694,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7962,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.48709130599059774,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7285,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.40376554364298234,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.8016,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.36837916085597394,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.6855,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.4483714742264585,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.7492,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3751255820392184,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.724,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.46753824670719196,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7511,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.36924248073327026,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.6697,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4311324864533044,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.8389,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3468847141233154,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.6796,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.40305637231032954,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7589,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.37006132966778144,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7222,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3828599194296075,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7471,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.42119470659162916,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.8019,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4240891071356015,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.7722,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.343317961799583,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.681,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3858696373590418,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7242,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3852297204123048,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.82,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.3660298327029085,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7614,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3501196050916281,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7489,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.4312933762780563,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7936,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.38094591751083784,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.8143,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.4304063885736067,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.8567,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3530318575675933,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7033,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.34922822590438607,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.6877,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3526678283487226,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.7317,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.7094102415989282,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.8096,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.41180672382392763,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7674,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3797380672648192,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7135,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.3666105727289888,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7116,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.3939067802745871,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.8032,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3715620422004267,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.7724,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.3931537314827726,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7537,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.35281398953949933,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7231,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.38182168546498296,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7483,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3657635352324864,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.6833,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3824285828193031,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7575,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.37882674471517297,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.736,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.37706586346203985,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7194,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.37702088330630684,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7112,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.345197042865683,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7507,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3952886290968386,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.806,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4156637546454637,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7324,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.4181202383313785,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.7611,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.39953806832811245,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7984,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.39038420791176903,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7821,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3171140666423134,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.598,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3671672595983185,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7397,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3494979584858635,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.6832,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3739816532239034,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.7122,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.3434731826805483,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.71,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.4175439167844619,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7685,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.38016444223431994,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7305,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.3816533198845126,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.7436,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.36357433955019786,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7354,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.38398526161124624,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.8063,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.39414628772972166,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7807,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.35210866914929495,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.7157,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.45679210106521884,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7635,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.35302500589097974,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7143,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5043013214314466,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.6692,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3901330300448095,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.7647,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.41218713945684576,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.8084,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.38756210150749887,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7732,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3535259267039688,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.6677,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.41043791682590347,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.8145,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3955724908252904,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7789,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3897792547207055,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7572,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.3849603185219187,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7607,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4033374013090449,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.7728,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.4085978341355722,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7689,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4317191681124255,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.8225,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.36886491969634516,
+      "learning_rate": 0.0,
+      "loss": 0.6977,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 273275681636352.0,
+      "train_loss": 0.7957936550180117,
+      "train_runtime": 4889.7546,
+      "train_samples_per_second": 1.023,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 273275681636352.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..13d9060160fbad1bafb8b6da3eeb56ca6bbea573
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..08c0d23e21dea8488e5be9c6158f457e9b83b4e0
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8c25b431e0d66526adddf63874623da42b2b8345a74cd589a4adbbdf0ed2c8d
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f06302ebff3dfafdaacc4077ed7c2312ef7b3825
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11a5ee48a4cdd5e71d24c8b129ac499b66de672d387a7626163c2ede608c6bd1
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c8f1cafbfac42eeea479536c7ac86ccf432f8465
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7353335647051229,
+      "learning_rate": 2e-05,
+      "loss": 1.2076,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7279787019415223,
+      "learning_rate": 4e-05,
+      "loss": 1.2524,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7010361557438148,
+      "learning_rate": 6e-05,
+      "loss": 1.2511,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.6687401924085121,
+      "learning_rate": 8e-05,
+      "loss": 1.2113,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7346425995604949,
+      "learning_rate": 0.0001,
+      "loss": 1.1487,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.756184601542448,
+      "learning_rate": 0.00012,
+      "loss": 1.1107,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.8095698213691533,
+      "learning_rate": 0.00014,
+      "loss": 0.9564,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7222059057957128,
+      "learning_rate": 0.00016,
+      "loss": 0.9622,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.5791907109692352,
+      "learning_rate": 0.00018,
+      "loss": 0.9456,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.4755161585748817,
+      "learning_rate": 0.0002,
+      "loss": 0.8847,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.534405283597521,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 1.023,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.48687859353694035,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.9897,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5569660910880551,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.9866,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5616330007320514,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.998,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.5611217307031949,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.8927,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5370487900027019,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9287,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.46385921148664616,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8463,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5051655153816611,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.9266,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.4733629776054302,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.87,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.45292377837228687,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.887,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.46552242008280886,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8795,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.46418388572124936,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.887,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.42205968841775676,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.7814,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.47135505086732565,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8869,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.4773649704916738,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8908,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.45081311668010043,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.9113,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4480881064139389,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.932,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4367083903922008,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8035,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.44252534295962925,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.876,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.4187006380577945,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.884,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3852515402302905,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8389,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.39441356614834566,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.8309,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.45323199266429537,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.9031,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.4203601208851421,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8488,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.39400991555307874,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.8236,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4078190992380361,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8059,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5571567140998109,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8765,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4339029798097146,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.8513,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.49432651381587406,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.9171,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.44088058900747906,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.9415,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.4769524240996268,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.9368,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.49387217949165196,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.9142,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.40446228014333374,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.8133,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.44322829688845194,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.9206,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.38951565881037564,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.7707,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.43181804382854094,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8291,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4073163231966112,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.8037,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5006944874836005,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8234,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.41216712583716064,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.823,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.40904550226721814,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.7697,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.45468333701316305,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8864,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4302613251664346,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.751,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.33829052241502605,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.6905,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4320963001746247,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.7661,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3898845794735546,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7615,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.43418724158248473,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8216,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.4026011963114364,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.8503,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4739645096526827,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7476,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.41850488712881967,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.8024,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.4709244414897302,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.8109,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3693587269306096,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.769,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.47301661699875125,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.889,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4386889037604545,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.8279,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.43011697191812426,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.7882,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.48661858269251446,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8426,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.40247048922777995,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.8584,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4508726830470603,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8058,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4358043323296017,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.782,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.44946172263886064,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8252,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4101863243055401,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.765,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.3847259325031979,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.8052,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4468868011811026,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8765,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3951583426923667,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.788,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.3985087499811743,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.7706,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.423798419258559,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.8708,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.374327382153691,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.7646,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.3989217108444149,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7928,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.4203983455730358,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.7771,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.41484946201589906,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.8396,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.38867408389341773,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7985,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4190613279860536,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.7909,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.39446934721916893,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8379,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.4530288411101365,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.8267,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.38791637036738574,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7791,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.4163205325536259,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.8395,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4336427847797014,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.86,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.43859603028051736,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.837,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4796050483752347,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.8337,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4148636880540723,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.8407,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4278239391229428,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8607,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4171106509125376,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.7848,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.43952389800771496,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.8784,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.4110890120327196,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7378,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4016951197596061,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7629,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4107735649609541,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7876,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3975769330362091,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7735,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.41267371328534225,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7443,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.3953945159539454,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8453,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.43071752234101507,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.8352,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4493749509773131,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.8775,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.40149997325603126,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7865,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.5476453056261936,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.8126,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.39586801827264484,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7457,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.5646707566961765,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7555,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.4475995669275955,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.797,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4313869191817502,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7591,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3882927614545473,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.7339,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3844877948455331,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.7714,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4447936789010439,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8044,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.3659818159150816,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.8186,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.48371725315043806,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.8727,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4236883958628823,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.8057,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4089980647537749,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7718,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.505941391374786,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.8456,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.4823477816929391,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8652,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.46891506565573476,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.785,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.6453908453138354,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.8004,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.39156015317884774,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7909,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4022394970785278,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.7843,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4511108310970962,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.8327,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.453833054764165,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.8107,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3892486907170623,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.7089,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.36810555047180393,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7515,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3945608101150715,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.802,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4154686776912597,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.7985,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.42640254320918586,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7384,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.42327914519915,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.7377,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3929897707814567,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.7007,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.44007030909080774,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.8315,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.42831386146949785,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7876,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.45712385909551956,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.7998,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.40301730558878535,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.8143,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.44697077015053915,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.7987,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.366128484726172,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.7081,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.5013227951492534,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.7745,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3583956153339671,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7543,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.405702524328237,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.7804,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.41039096959060223,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.8108,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.4238385217492043,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.7638,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4337671096969143,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.7978,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.42543365056666493,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7997,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.42365486991291884,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.81,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4250077934117738,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.749,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.39997182540477255,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7409,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4246823162495634,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.7807,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.42405231705778645,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.8486,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.3746023786640388,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.728,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4029687500995392,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7975,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.39955296906769533,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7628,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.5451790165621929,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.7423,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.44503423354162813,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.8514,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4143063174690902,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8138,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.39587098997532205,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7566,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.40056336455504077,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.7906,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.40444250822644906,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.7907,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.41711976704966636,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.7519,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4025532438690134,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.7688,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3938260305761322,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.7984,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.38295899277573886,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.7649,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.40748226134928633,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7888,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4147850854909231,
+      "learning_rate": 0.0001,
+      "loss": 0.8724,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.42902160057483835,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.7562,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.5612209341617765,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7756,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.41655921601946894,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.7856,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.42825407866222126,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.8294,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.38306567130243957,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.7642,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.49876992207725157,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.8447,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.4736142406610525,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.9073,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3918865694655684,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.7816,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3664553849638891,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7516,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.39712600325232983,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.7371,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4600571784465661,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.8381,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.3628261375645332,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.747,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.393616467287087,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7235,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.47617434563057714,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.8087,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.40997243707815617,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.7951,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3801073410542062,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.7078,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.8011285925365285,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7284,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3863622880101341,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.7404,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4295315822819754,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.7839,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3966157719283058,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.754,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.44437922960807463,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8257,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4287544680995954,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.8008,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4347600038177852,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.8624,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3863039786771903,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.8026,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3632768447767422,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.6955,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4579704147896861,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.928,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.35567333575144844,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.7131,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.42351725520340977,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.8225,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3695258935779004,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7611,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3616084000908627,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.7444,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3640094661089709,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7047,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.4105001115396971,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.7566,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3801573705419409,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7274,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3735136572378132,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.7516,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3859130222934957,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.7686,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.42009825247089627,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7361,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.40298557972576315,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.733,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.40950437300610265,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7993,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3608475513637045,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7089,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.44061580725734156,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.8313,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4447650079537509,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.782,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3622557561934682,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7714,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.404647783044088,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.7748,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.38394711987452385,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.7597,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.3727936161926316,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7853,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.49024845843748677,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.8499,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.4037226593320179,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.7434,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.37262038078797616,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7668,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3764820116836066,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7508,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3740403474660709,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7133,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.3911675870000252,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.7331,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.44092476422884314,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.803,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5965084946433509,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7782,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.3694223849820989,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.696,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.39835566672325723,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7607,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.40421830431345207,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.77,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.4327460763114331,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7976,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.36325155518235447,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.7052,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 1.0751296068478726,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.7527,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.3897182885312082,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7804,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.4169239593245994,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7822,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.3691444779397978,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.7445,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.39625995775209094,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.6983,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3881640929163733,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7913,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.39334899909767057,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7223,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.36526742592830835,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7562,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.4284379055555717,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.8153,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.3445180285265342,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.7111,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.39156607346352107,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7803,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.32983355633155836,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.6598,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3973990647192029,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.8145,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.39917217243119213,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.7809,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.35699613230012617,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7515,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3757030510807036,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7572,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.39574927462744697,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7541,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3638118135822814,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.6844,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.3532576466352604,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.755,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.37812827082129896,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.7661,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.4000186907926742,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7648,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.38478863206079855,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7915,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.40839553211815444,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7842,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.36134207947659874,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.6276,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.41146843594962335,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.8122,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.390594784718953,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.6959,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.417708087683773,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.7417,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.41770966842517177,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7427,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.45819003898106186,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.783,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3608229753200471,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.7077,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.42750323157502884,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7661,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3737648119297154,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7622,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.42424402467674727,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7997,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.4251037432545757,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.8169,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4289372594381212,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.8064,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3926113472204545,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.774,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.40609687366384595,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7674,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3816905938196553,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.7223,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.39092919615885185,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7598,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.38177892948495634,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7474,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3904483662558189,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7693,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.39329544302846187,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.8033,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.41124017095318816,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.6987,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.3843691621015497,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7485,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.38855507856424987,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.7168,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3507519796770734,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6648,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.37421219471051437,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.7689,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.35786148880826973,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.6801,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.40146892050862254,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.772,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.4372265680239677,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.8464,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.44612067779519543,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.7776,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.42670276262977314,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.8086,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.8227660388329809,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.6584,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.3842207858258516,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7495,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.39540138571188127,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7686,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.36498305543668375,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7179,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.3686666206373811,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.6837,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.405537960570134,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.7392,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3732960016607687,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7265,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.4030197091110242,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7875,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.4213002026168049,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.7773,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3761131954115017,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7399,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4132882262141919,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7894,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.41423363619491843,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.8073,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.38761750502367714,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.7416,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3839770008034293,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.724,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3599229423297392,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.7067,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3888595749417921,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7476,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3443579491802449,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7092,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.44403754362186026,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.844,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.37608113786992864,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7038,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.3983224257988678,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7299,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3914566399179377,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.7044,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.3916343065284849,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.7835,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.7908788036382982,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.6602,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.41562052712483977,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.7497,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.38743588474530594,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.7559,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.40247540341968846,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.7366,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.3723632343743109,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.7587,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.4800849762941837,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7571,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.34878360642444717,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7249,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3726060412123766,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.7418,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.40157548278742655,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.7364,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4187836441125278,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7549,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.360392159342072,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7027,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.3901673132932371,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7788,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.3780802382891487,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.689,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4548628926236984,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.825,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.38820521164508304,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.8133,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.34923336157859275,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.6934,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.45316721827824846,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.8081,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.4166271000686725,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.8381,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.362295912386709,
+      "learning_rate": 0.0,
+      "loss": 0.7171,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 275545903136768.0,
+      "train_loss": 0.798310113640932,
+      "train_runtime": 4869.1433,
+      "train_samples_per_second": 1.027,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 275545903136768.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: ./weights/Bunny-v1_1-Llama-3-8B-V
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ca4ae2a30b01a6a37ff0b01e8938dc1aec098c4
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7789e5c33b4fd426afd5fb7f6fb426e9139c42c6
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ecf96555c51bf18186d1f114312478b63cc3766fb05ccc9a6132c1acae57b76
+size 671150064
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json
@@ -0,0 +1,45 @@
+{
+  "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V",
+  "architectures": [
+    "BunnyLlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig",
+    "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM"
+  },
+  "bos_token_id": 128000,
+  "continuous_training": false,
+  "eos_token_id": 128001,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 8192,
+  "mm_hidden_size": 3456,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_tower": "./weights/siglip-so400m-patch14-384",
+  "model_type": "bunny-llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 2048,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_vision_tower": true,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_s2": true,
+  "vocab_size": 128256
+}
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b590d3a13a711a16bb44fb174d3f1fe2543bbf96
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d07deab2c1319d9c360d9e538320d34762c5ad339d8ac2930e59cfd62acd246
+size 918507402
diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca704958f6ba747d300fc60013919cfb68708d64
--- /dev/null
+++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json
@@ -0,0 +1,2226 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9984,
+  "eval_steps": 500,
+  "global_step": 312,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.7353840109320694,
+      "learning_rate": 2e-05,
+      "loss": 1.1834,
+      "step": 1
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.7209487970131268,
+      "learning_rate": 4e-05,
+      "loss": 1.2349,
+      "step": 2
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7553956094521731,
+      "learning_rate": 6e-05,
+      "loss": 1.2734,
+      "step": 3
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7039345337655484,
+      "learning_rate": 8e-05,
+      "loss": 1.2004,
+      "step": 4
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.728834613513379,
+      "learning_rate": 0.0001,
+      "loss": 1.0934,
+      "step": 5
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.8685229227681107,
+      "learning_rate": 0.00012,
+      "loss": 1.0323,
+      "step": 6
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.802914133393135,
+      "learning_rate": 0.00014,
+      "loss": 0.994,
+      "step": 7
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7739004688791559,
+      "learning_rate": 0.00016,
+      "loss": 0.9976,
+      "step": 8
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.7902313870210783,
+      "learning_rate": 0.00018,
+      "loss": 0.931,
+      "step": 9
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.593104883060895,
+      "learning_rate": 0.0002,
+      "loss": 0.9584,
+      "step": 10
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5113190956320616,
+      "learning_rate": 0.00019999458931878073,
+      "loss": 0.9149,
+      "step": 11
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5060503485009137,
+      "learning_rate": 0.0001999783578606323,
+      "loss": 0.911,
+      "step": 12
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.4382313868336973,
+      "learning_rate": 0.00019995130738201966,
+      "loss": 0.8071,
+      "step": 13
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.5922615784284019,
+      "learning_rate": 0.0001999134408101731,
+      "loss": 0.9103,
+      "step": 14
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7167160374888876,
+      "learning_rate": 0.00019986476224277165,
+      "loss": 0.9744,
+      "step": 15
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.601282492598755,
+      "learning_rate": 0.00019980527694749952,
+      "loss": 0.9426,
+      "step": 16
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.4719083490978608,
+      "learning_rate": 0.00019973499136147606,
+      "loss": 0.8633,
+      "step": 17
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.519145531179642,
+      "learning_rate": 0.0001996539130905593,
+      "loss": 0.8862,
+      "step": 18
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.45537967551722425,
+      "learning_rate": 0.0001995620509085228,
+      "loss": 0.8969,
+      "step": 19
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.4195977357742255,
+      "learning_rate": 0.00019945941475610623,
+      "loss": 0.8835,
+      "step": 20
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.44714643731390347,
+      "learning_rate": 0.0001993460157399396,
+      "loss": 0.8649,
+      "step": 21
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.44320940901514166,
+      "learning_rate": 0.0001992218661313415,
+      "loss": 0.8439,
+      "step": 22
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.45330036496349835,
+      "learning_rate": 0.00019908697936499103,
+      "loss": 0.8766,
+      "step": 23
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.42451975536011216,
+      "learning_rate": 0.00019894137003747403,
+      "loss": 0.8387,
+      "step": 24
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.43976281966016145,
+      "learning_rate": 0.00019878505390570362,
+      "loss": 0.8362,
+      "step": 25
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.45816566151513477,
+      "learning_rate": 0.00019861804788521493,
+      "loss": 0.8274,
+      "step": 26
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.45606272179204993,
+      "learning_rate": 0.00019844037004833473,
+      "loss": 0.9505,
+      "step": 27
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.4381119339418824,
+      "learning_rate": 0.00019825203962222572,
+      "loss": 0.8236,
+      "step": 28
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4799753876158708,
+      "learning_rate": 0.0001980530769868059,
+      "loss": 0.8139,
+      "step": 29
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.45868912405964,
+      "learning_rate": 0.00019784350367254322,
+      "loss": 0.9248,
+      "step": 30
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.4152018270320052,
+      "learning_rate": 0.0001976233423581255,
+      "loss": 0.8015,
+      "step": 31
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.4825359832052692,
+      "learning_rate": 0.0001973926168680066,
+      "loss": 0.875,
+      "step": 32
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5168068629984979,
+      "learning_rate": 0.00019715135216982798,
+      "loss": 0.9829,
+      "step": 33
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.43712766079753224,
+      "learning_rate": 0.0001968995743717171,
+      "loss": 0.8483,
+      "step": 34
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.46563267033392625,
+      "learning_rate": 0.00019663731071946206,
+      "loss": 0.828,
+      "step": 35
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4663166112787982,
+      "learning_rate": 0.00019636458959356316,
+      "loss": 0.8483,
+      "step": 36
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.4484015420161677,
+      "learning_rate": 0.0001960814405061619,
+      "loss": 0.8452,
+      "step": 37
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.42753376993128667,
+      "learning_rate": 0.00019578789409784727,
+      "loss": 0.766,
+      "step": 38
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.4862274405553227,
+      "learning_rate": 0.00019548398213434007,
+      "loss": 0.8625,
+      "step": 39
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.4546399694854664,
+      "learning_rate": 0.00019516973750305532,
+      "loss": 0.9057,
+      "step": 40
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5223692563511014,
+      "learning_rate": 0.00019484519420954354,
+      "loss": 0.9527,
+      "step": 41
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.47122827916858195,
+      "learning_rate": 0.00019451038737381077,
+      "loss": 0.8754,
+      "step": 42
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.42665585601481504,
+      "learning_rate": 0.00019416535322651818,
+      "loss": 0.8565,
+      "step": 43
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.43940448209185695,
+      "learning_rate": 0.00019381012910506146,
+      "loss": 0.8715,
+      "step": 44
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3973470431891018,
+      "learning_rate": 0.00019344475344953012,
+      "loss": 0.81,
+      "step": 45
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4372037210936052,
+      "learning_rate": 0.00019306926579854821,
+      "loss": 0.8431,
+      "step": 46
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.41326954936737587,
+      "learning_rate": 0.00019268370678499533,
+      "loss": 0.8158,
+      "step": 47
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4110024162222225,
+      "learning_rate": 0.0001922881181316097,
+      "loss": 0.8194,
+      "step": 48
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.40563186797054557,
+      "learning_rate": 0.00019188254264647337,
+      "loss": 0.7973,
+      "step": 49
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.4265169122791336,
+      "learning_rate": 0.0001914670242183795,
+      "loss": 0.8859,
+      "step": 50
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.44751438315175346,
+      "learning_rate": 0.0001910416078120832,
+      "loss": 0.8217,
+      "step": 51
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.44849247915843415,
+      "learning_rate": 0.0001906063394634356,
+      "loss": 0.7821,
+      "step": 52
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.4239607699371837,
+      "learning_rate": 0.00019016126627440237,
+      "loss": 0.743,
+      "step": 53
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.4472801716671858,
+      "learning_rate": 0.00018970643640796642,
+      "loss": 0.8384,
+      "step": 54
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4005685576060965,
+      "learning_rate": 0.000189241899082916,
+      "loss": 0.7432,
+      "step": 55
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4242075729779606,
+      "learning_rate": 0.00018876770456851877,
+      "loss": 0.8966,
+      "step": 56
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.40286050662034556,
+      "learning_rate": 0.0001882839041790818,
+      "loss": 0.7687,
+      "step": 57
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.40660980251274575,
+      "learning_rate": 0.00018779055026839868,
+      "loss": 0.7573,
+      "step": 58
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.39606993921713546,
+      "learning_rate": 0.00018728769622408423,
+      "loss": 0.7909,
+      "step": 59
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.379037677841038,
+      "learning_rate": 0.00018677539646179707,
+      "loss": 0.7575,
+      "step": 60
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4224876839394266,
+      "learning_rate": 0.00018625370641935129,
+      "loss": 0.7678,
+      "step": 61
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.40588337794375384,
+      "learning_rate": 0.00018572268255071718,
+      "loss": 0.8066,
+      "step": 62
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3845362324651911,
+      "learning_rate": 0.00018518238231991218,
+      "loss": 0.7523,
+      "step": 63
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.4381708599126164,
+      "learning_rate": 0.00018463286419478255,
+      "loss": 0.8432,
+      "step": 64
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.45276926242831855,
+      "learning_rate": 0.00018407418764067627,
+      "loss": 0.8639,
+      "step": 65
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.37051457899931317,
+      "learning_rate": 0.00018350641311400812,
+      "loss": 0.7904,
+      "step": 66
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.40990165361568964,
+      "learning_rate": 0.0001829296020557174,
+      "loss": 0.8222,
+      "step": 67
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.4183200084135454,
+      "learning_rate": 0.00018234381688461942,
+      "loss": 0.7671,
+      "step": 68
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.432988942689945,
+      "learning_rate": 0.0001817491209906506,
+      "loss": 0.8457,
+      "step": 69
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4453566315058246,
+      "learning_rate": 0.00018114557872800905,
+      "loss": 0.7248,
+      "step": 70
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.42925663557577415,
+      "learning_rate": 0.00018053325540819045,
+      "loss": 0.7903,
+      "step": 71
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5220117016806057,
+      "learning_rate": 0.0001799122172929206,
+      "loss": 0.8751,
+      "step": 72
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.40668797698470954,
+      "learning_rate": 0.00017928253158698473,
+      "loss": 0.7212,
+      "step": 73
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.40055891363920837,
+      "learning_rate": 0.0001786442664309554,
+      "loss": 0.8155,
+      "step": 74
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.39726475648010184,
+      "learning_rate": 0.0001779974908938184,
+      "loss": 0.7836,
+      "step": 75
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.45508866275878007,
+      "learning_rate": 0.0001773422749654988,
+      "loss": 0.824,
+      "step": 76
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.41489798086407326,
+      "learning_rate": 0.00017667868954928694,
+      "loss": 0.7773,
+      "step": 77
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.42777300712080363,
+      "learning_rate": 0.00017600680645416583,
+      "loss": 0.8128,
+      "step": 78
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.42703268648937903,
+      "learning_rate": 0.00017532669838704035,
+      "loss": 0.8463,
+      "step": 79
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.39515846824029793,
+      "learning_rate": 0.00017463843894486937,
+      "loss": 0.7655,
+      "step": 80
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3688518956481181,
+      "learning_rate": 0.0001739421026067017,
+      "loss": 0.756,
+      "step": 81
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.42366848797339224,
+      "learning_rate": 0.00017323776472561627,
+      "loss": 0.8382,
+      "step": 82
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5243105173935234,
+      "learning_rate": 0.00017252550152056795,
+      "loss": 0.871,
+      "step": 83
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.39167002909509857,
+      "learning_rate": 0.0001718053900681397,
+      "loss": 0.7633,
+      "step": 84
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3714396408422754,
+      "learning_rate": 0.00017107750829420176,
+      "loss": 0.7154,
+      "step": 85
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.5093555559695603,
+      "learning_rate": 0.00017034193496547902,
+      "loss": 0.9635,
+      "step": 86
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4824892609154267,
+      "learning_rate": 0.00016959874968102735,
+      "loss": 0.8266,
+      "step": 87
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.403166248117673,
+      "learning_rate": 0.00016884803286362,
+      "loss": 0.7489,
+      "step": 88
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.45690137982899215,
+      "learning_rate": 0.00016808986575104465,
+      "loss": 0.8993,
+      "step": 89
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.42481386891603745,
+      "learning_rate": 0.00016732433038731242,
+      "loss": 0.8961,
+      "step": 90
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4132022855225946,
+      "learning_rate": 0.0001665515096137797,
+      "loss": 0.8235,
+      "step": 91
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4604603897430795,
+      "learning_rate": 0.00016577148706018328,
+      "loss": 0.918,
+      "step": 92
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3716709937924135,
+      "learning_rate": 0.00016498434713559088,
+      "loss": 0.7695,
+      "step": 93
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4322074676635217,
+      "learning_rate": 0.00016419017501926656,
+      "loss": 0.7946,
+      "step": 94
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.347234456908987,
+      "learning_rate": 0.0001633890566514535,
+      "loss": 0.7293,
+      "step": 95
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4656021839883055,
+      "learning_rate": 0.00016258107872407375,
+      "loss": 0.7828,
+      "step": 96
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3913013286778377,
+      "learning_rate": 0.0001617663286713474,
+      "loss": 0.7493,
+      "step": 97
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.44779808523000536,
+      "learning_rate": 0.00016094489466033043,
+      "loss": 0.8271,
+      "step": 98
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.4013665784290249,
+      "learning_rate": 0.00016011686558137448,
+      "loss": 0.852,
+      "step": 99
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.4519606431932703,
+      "learning_rate": 0.0001592823310385073,
+      "loss": 0.7996,
+      "step": 100
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.3751892895358891,
+      "learning_rate": 0.0001584413813397364,
+      "loss": 0.7275,
+      "step": 101
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.41081716737905427,
+      "learning_rate": 0.00015759410748727662,
+      "loss": 0.7725,
+      "step": 102
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.3711590528636452,
+      "learning_rate": 0.00015674060116770236,
+      "loss": 0.7327,
+      "step": 103
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.381588644256682,
+      "learning_rate": 0.00015588095474202595,
+      "loss": 0.7688,
+      "step": 104
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.3931817700311297,
+      "learning_rate": 0.00015501526123570277,
+      "loss": 0.7852,
+      "step": 105
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.4089801949825822,
+      "learning_rate": 0.00015414361432856475,
+      "loss": 0.7861,
+      "step": 106
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.4232827654924218,
+      "learning_rate": 0.0001532661083446829,
+      "loss": 0.8403,
+      "step": 107
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.46829156018564105,
+      "learning_rate": 0.00015238283824216015,
+      "loss": 0.842,
+      "step": 108
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.44314902877740564,
+      "learning_rate": 0.00015149389960285558,
+      "loss": 0.8634,
+      "step": 109
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.393032587937408,
+      "learning_rate": 0.00015059938862204127,
+      "loss": 0.7417,
+      "step": 110
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.42764416554166595,
+      "learning_rate": 0.00014969940209799248,
+      "loss": 0.8094,
+      "step": 111
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.4312513961612947,
+      "learning_rate": 0.00014879403742151283,
+      "loss": 0.8231,
+      "step": 112
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4070194488889572,
+      "learning_rate": 0.00014788339256539544,
+      "loss": 0.7681,
+      "step": 113
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4091466776508617,
+      "learning_rate": 0.0001469675660738206,
+      "loss": 0.7416,
+      "step": 114
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.44296615018649654,
+      "learning_rate": 0.00014604665705169237,
+      "loss": 0.8017,
+      "step": 115
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.4477265021064643,
+      "learning_rate": 0.00014512076515391375,
+      "loss": 0.7836,
+      "step": 116
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.41319881703951783,
+      "learning_rate": 0.00014418999057460276,
+      "loss": 0.7921,
+      "step": 117
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4005940267984183,
+      "learning_rate": 0.0001432544340362501,
+      "loss": 0.7526,
+      "step": 118
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.45608955737367257,
+      "learning_rate": 0.00014231419677881966,
+      "loss": 0.8106,
+      "step": 119
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.5196496738602516,
+      "learning_rate": 0.00014136938054879283,
+      "loss": 0.9165,
+      "step": 120
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.46502595544402453,
+      "learning_rate": 0.00014042008758815818,
+      "loss": 0.8339,
+      "step": 121
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.3866161941995701,
+      "learning_rate": 0.00013946642062334766,
+      "loss": 0.6711,
+      "step": 122
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4300397374744516,
+      "learning_rate": 0.00013850848285411994,
+      "loss": 0.7829,
+      "step": 123
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4057684510680121,
+      "learning_rate": 0.000137546377942393,
+      "loss": 0.8033,
+      "step": 124
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4934930216202543,
+      "learning_rate": 0.00013658021000102636,
+      "loss": 0.8336,
+      "step": 125
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.43660332196391444,
+      "learning_rate": 0.00013561008358255468,
+      "loss": 0.7737,
+      "step": 126
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.36982868459745566,
+      "learning_rate": 0.00013463610366787392,
+      "loss": 0.7694,
+      "step": 127
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.37518828351585176,
+      "learning_rate": 0.00013365837565488064,
+      "loss": 0.6903,
+      "step": 128
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.3892476050497531,
+      "learning_rate": 0.0001326770053470668,
+      "loss": 0.7398,
+      "step": 129
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3880862155492406,
+      "learning_rate": 0.0001316920989420703,
+      "loss": 0.7614,
+      "step": 130
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4342991834762453,
+      "learning_rate": 0.00013070376302018287,
+      "loss": 0.835,
+      "step": 131
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4487972214877266,
+      "learning_rate": 0.00012971210453281674,
+      "loss": 0.7671,
+      "step": 132
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.47768467731735426,
+      "learning_rate": 0.000128717230790931,
+      "loss": 0.8454,
+      "step": 133
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3785506493204796,
+      "learning_rate": 0.00012771924945341906,
+      "loss": 0.782,
+      "step": 134
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.42543371400212765,
+      "learning_rate": 0.00012671826851545851,
+      "loss": 0.8441,
+      "step": 135
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.39001496502671096,
+      "learning_rate": 0.0001257143962968246,
+      "loss": 0.7936,
+      "step": 136
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.37317606108854123,
+      "learning_rate": 0.00012470774143016853,
+      "loss": 0.6767,
+      "step": 137
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.38597381193367974,
+      "learning_rate": 0.00012369841284926188,
+      "loss": 0.8163,
+      "step": 138
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.40269623422546613,
+      "learning_rate": 0.00012268651977720866,
+      "loss": 0.764,
+      "step": 139
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.47597420155265463,
+      "learning_rate": 0.00012167217171462566,
+      "loss": 0.8383,
+      "step": 140
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3928737482785004,
+      "learning_rate": 0.0001206554784277931,
+      "loss": 0.7357,
+      "step": 141
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.4319293741755596,
+      "learning_rate": 0.00011963654993677645,
+      "loss": 0.8176,
+      "step": 142
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.4294627464439455,
+      "learning_rate": 0.00011861549650352069,
+      "loss": 0.7461,
+      "step": 143
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4162533969938797,
+      "learning_rate": 0.00011759242861991855,
+      "loss": 0.7383,
+      "step": 144
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4198711385889494,
+      "learning_rate": 0.00011656745699585371,
+      "loss": 0.8186,
+      "step": 145
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.3922488342536766,
+      "learning_rate": 0.00011554069254722051,
+      "loss": 0.7723,
+      "step": 146
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.5409625548162935,
+      "learning_rate": 0.00011451224638392129,
+      "loss": 0.7735,
+      "step": 147
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.3910548432980919,
+      "learning_rate": 0.00011348222979784289,
+      "loss": 0.7857,
+      "step": 148
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.39252399199003585,
+      "learning_rate": 0.00011245075425081328,
+      "loss": 0.7596,
+      "step": 149
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.40428584910180754,
+      "learning_rate": 0.00011141793136253986,
+      "loss": 0.834,
+      "step": 150
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.47403461784272943,
+      "learning_rate": 0.0001103838728985307,
+      "loss": 0.8278,
+      "step": 151
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.4095529823613936,
+      "learning_rate": 0.000109348690758,
+      "loss": 0.8558,
+      "step": 152
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3740305342667031,
+      "learning_rate": 0.00010831249696175918,
+      "loss": 0.7269,
+      "step": 153
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.43701721221728995,
+      "learning_rate": 0.0001072754036400944,
+      "loss": 0.8677,
+      "step": 154
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.4133393393713412,
+      "learning_rate": 0.00010623752302063283,
+      "loss": 0.8107,
+      "step": 155
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.4050403831582587,
+      "learning_rate": 0.00010519896741619803,
+      "loss": 0.809,
+      "step": 156
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.38270730744357806,
+      "learning_rate": 0.00010415984921265609,
+      "loss": 0.8066,
+      "step": 157
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.4818734887838035,
+      "learning_rate": 0.00010312028085675391,
+      "loss": 0.6843,
+      "step": 158
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.4346256774851159,
+      "learning_rate": 0.00010208037484395114,
+      "loss": 0.7645,
+      "step": 159
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.40263681194196066,
+      "learning_rate": 0.00010104024370624644,
+      "loss": 0.7637,
+      "step": 160
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4409310065438654,
+      "learning_rate": 0.0001,
+      "loss": 0.8307,
+      "step": 161
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4065733914815933,
+      "learning_rate": 9.895975629375359e-05,
+      "loss": 0.8029,
+      "step": 162
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.4683498195746497,
+      "learning_rate": 9.791962515604887e-05,
+      "loss": 0.7993,
+      "step": 163
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.42273868753437777,
+      "learning_rate": 9.687971914324607e-05,
+      "loss": 0.8318,
+      "step": 164
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.40803389605864315,
+      "learning_rate": 9.584015078734395e-05,
+      "loss": 0.7709,
+      "step": 165
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.3764654046149156,
+      "learning_rate": 9.480103258380198e-05,
+      "loss": 0.6865,
+      "step": 166
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3932258557712938,
+      "learning_rate": 9.376247697936719e-05,
+      "loss": 0.7805,
+      "step": 167
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.5124157391438808,
+      "learning_rate": 9.272459635990562e-05,
+      "loss": 0.8974,
+      "step": 168
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.3674900641039262,
+      "learning_rate": 9.168750303824084e-05,
+      "loss": 0.8064,
+      "step": 169
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.3568883570150442,
+      "learning_rate": 9.065130924199998e-05,
+      "loss": 0.7544,
+      "step": 170
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.4161693538916811,
+      "learning_rate": 8.961612710146934e-05,
+      "loss": 0.8103,
+      "step": 171
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.39933399376075757,
+      "learning_rate": 8.858206863746018e-05,
+      "loss": 0.804,
+      "step": 172
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.3771684556048613,
+      "learning_rate": 8.754924574918675e-05,
+      "loss": 0.7555,
+      "step": 173
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.3666071873765809,
+      "learning_rate": 8.651777020215712e-05,
+      "loss": 0.7289,
+      "step": 174
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.46275769563391067,
+      "learning_rate": 8.548775361607872e-05,
+      "loss": 0.7583,
+      "step": 175
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4162483714676823,
+      "learning_rate": 8.445930745277953e-05,
+      "loss": 0.787,
+      "step": 176
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3496408616474532,
+      "learning_rate": 8.343254300414628e-05,
+      "loss": 0.682,
+      "step": 177
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.36043905445513763,
+      "learning_rate": 8.240757138008149e-05,
+      "loss": 0.7286,
+      "step": 178
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.36715876356985133,
+      "learning_rate": 8.138450349647936e-05,
+      "loss": 0.6988,
+      "step": 179
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.4076632877821367,
+      "learning_rate": 8.036345006322359e-05,
+      "loss": 0.8145,
+      "step": 180
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.40920620759987913,
+      "learning_rate": 7.934452157220694e-05,
+      "loss": 0.7643,
+      "step": 181
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3995786333038116,
+      "learning_rate": 7.832782828537437e-05,
+      "loss": 0.8538,
+      "step": 182
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4032318823087174,
+      "learning_rate": 7.731348022279134e-05,
+      "loss": 0.7301,
+      "step": 183
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.4488703475183601,
+      "learning_rate": 7.630158715073813e-05,
+      "loss": 0.7888,
+      "step": 184
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.41137365079283433,
+      "learning_rate": 7.52922585698315e-05,
+      "loss": 0.7858,
+      "step": 185
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3663844906011778,
+      "learning_rate": 7.428560370317542e-05,
+      "loss": 0.703,
+      "step": 186
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4163433401795639,
+      "learning_rate": 7.328173148454151e-05,
+      "loss": 0.8451,
+      "step": 187
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4206809795809301,
+      "learning_rate": 7.228075054658096e-05,
+      "loss": 0.6838,
+      "step": 188
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3808816377292912,
+      "learning_rate": 7.1282769209069e-05,
+      "loss": 0.715,
+      "step": 189
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.3883832414891323,
+      "learning_rate": 7.028789546718326e-05,
+      "loss": 0.7572,
+      "step": 190
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.4401038202378752,
+      "learning_rate": 6.929623697981718e-05,
+      "loss": 0.7141,
+      "step": 191
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.3544499562967409,
+      "learning_rate": 6.830790105792973e-05,
+      "loss": 0.7231,
+      "step": 192
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.3822226329255116,
+      "learning_rate": 6.732299465293322e-05,
+      "loss": 0.727,
+      "step": 193
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.4116384972210298,
+      "learning_rate": 6.63416243451194e-05,
+      "loss": 0.7449,
+      "step": 194
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3711307535071894,
+      "learning_rate": 6.536389633212609e-05,
+      "loss": 0.7257,
+      "step": 195
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.43555632358230745,
+      "learning_rate": 6.43899164174453e-05,
+      "loss": 0.8119,
+      "step": 196
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3806265680783455,
+      "learning_rate": 6.341978999897365e-05,
+      "loss": 0.7372,
+      "step": 197
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.41831115759962684,
+      "learning_rate": 6.245362205760704e-05,
+      "loss": 0.7801,
+      "step": 198
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.4192516818649323,
+      "learning_rate": 6.149151714588009e-05,
+      "loss": 0.7815,
+      "step": 199
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.38057749267071994,
+      "learning_rate": 6.053357937665237e-05,
+      "loss": 0.7326,
+      "step": 200
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3995482231473873,
+      "learning_rate": 5.957991241184184e-05,
+      "loss": 0.8029,
+      "step": 201
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.40222742470520273,
+      "learning_rate": 5.863061945120719e-05,
+      "loss": 0.7153,
+      "step": 202
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3911062634474729,
+      "learning_rate": 5.768580322118034e-05,
+      "loss": 0.7822,
+      "step": 203
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.4408457326765675,
+      "learning_rate": 5.6745565963749925e-05,
+      "loss": 0.8023,
+      "step": 204
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4427495378097438,
+      "learning_rate": 5.5810009425397294e-05,
+      "loss": 0.7527,
+      "step": 205
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.38979992497886135,
+      "learning_rate": 5.487923484608629e-05,
+      "loss": 0.7564,
+      "step": 206
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.474115233171429,
+      "learning_rate": 5.395334294830765e-05,
+      "loss": 0.828,
+      "step": 207
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.42652280091447237,
+      "learning_rate": 5.3032433926179395e-05,
+      "loss": 0.8272,
+      "step": 208
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3831482219398793,
+      "learning_rate": 5.211660743460458e-05,
+      "loss": 0.7267,
+      "step": 209
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.40887318216030866,
+      "learning_rate": 5.1205962578487155e-05,
+      "loss": 0.7975,
+      "step": 210
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4244305440090109,
+      "learning_rate": 5.030059790200756e-05,
+      "loss": 0.7442,
+      "step": 211
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.32782090126215163,
+      "learning_rate": 4.940061137795876e-05,
+      "loss": 0.6522,
+      "step": 212
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.40689323278394435,
+      "learning_rate": 4.850610039714444e-05,
+      "loss": 0.7416,
+      "step": 213
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.3877856029005349,
+      "learning_rate": 4.761716175783989e-05,
+      "loss": 0.7233,
+      "step": 214
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.37916951385372083,
+      "learning_rate": 4.673389165531714e-05,
+      "loss": 0.722,
+      "step": 215
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.4115443896644859,
+      "learning_rate": 4.585638567143529e-05,
+      "loss": 0.7576,
+      "step": 216
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.35570562940457406,
+      "learning_rate": 4.498473876429726e-05,
+      "loss": 0.6883,
+      "step": 217
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.38880682946216655,
+      "learning_rate": 4.411904525797408e-05,
+      "loss": 0.7366,
+      "step": 218
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.3242374380482661,
+      "learning_rate": 4.325939883229766e-05,
+      "loss": 0.6939,
+      "step": 219
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.4187597997404342,
+      "learning_rate": 4.240589251272342e-05,
+      "loss": 0.8191,
+      "step": 220
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.38128869610042276,
+      "learning_rate": 4.155861866026364e-05,
+      "loss": 0.7826,
+      "step": 221
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3872980257786108,
+      "learning_rate": 4.071766896149273e-05,
+      "loss": 0.7959,
+      "step": 222
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.40387147344548646,
+      "learning_rate": 3.988313441862553e-05,
+      "loss": 0.7875,
+      "step": 223
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5076024361108772,
+      "learning_rate": 3.9055105339669595e-05,
+      "loss": 0.7878,
+      "step": 224
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.38212416356619255,
+      "learning_rate": 3.823367132865265e-05,
+      "loss": 0.7192,
+      "step": 225
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.3898006815497741,
+      "learning_rate": 3.741892127592625e-05,
+      "loss": 0.7523,
+      "step": 226
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3820374478482116,
+      "learning_rate": 3.6610943348546526e-05,
+      "loss": 0.7478,
+      "step": 227
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.3717263102767262,
+      "learning_rate": 3.580982498073344e-05,
+      "loss": 0.754,
+      "step": 228
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.38533746629000326,
+      "learning_rate": 3.501565286440914e-05,
+      "loss": 0.6413,
+      "step": 229
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.40032373490306794,
+      "learning_rate": 3.422851293981676e-05,
+      "loss": 0.7674,
+      "step": 230
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.38711121341634513,
+      "learning_rate": 3.3448490386220355e-05,
+      "loss": 0.742,
+      "step": 231
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.36975214377693705,
+      "learning_rate": 3.2675669612687565e-05,
+      "loss": 0.759,
+      "step": 232
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.36556697393084603,
+      "learning_rate": 3.191013424895536e-05,
+      "loss": 0.8053,
+      "step": 233
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.41472056125524964,
+      "learning_rate": 3.115196713638e-05,
+      "loss": 0.7582,
+      "step": 234
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.38558088058314577,
+      "learning_rate": 3.040125031897264e-05,
+      "loss": 0.7081,
+      "step": 235
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.38181110254932604,
+      "learning_rate": 2.9658065034520978e-05,
+      "loss": 0.7387,
+      "step": 236
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3871373476587633,
+      "learning_rate": 2.892249170579826e-05,
+      "loss": 0.7117,
+      "step": 237
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.4000363169658809,
+      "learning_rate": 2.8194609931860316e-05,
+      "loss": 0.7517,
+      "step": 238
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.41901585525242724,
+      "learning_rate": 2.7474498479432087e-05,
+      "loss": 0.8171,
+      "step": 239
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3647218821966735,
+      "learning_rate": 2.6762235274383772e-05,
+      "loss": 0.7237,
+      "step": 240
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3807125615509075,
+      "learning_rate": 2.6057897393298324e-05,
+      "loss": 0.7122,
+      "step": 241
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.4052240024383003,
+      "learning_rate": 2.536156105513062e-05,
+      "loss": 0.7436,
+      "step": 242
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3531418166157859,
+      "learning_rate": 2.4673301612959654e-05,
+      "loss": 0.6858,
+      "step": 243
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.389237017070924,
+      "learning_rate": 2.399319354583418e-05,
+      "loss": 0.7426,
+      "step": 244
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4150948200506419,
+      "learning_rate": 2.3321310450713062e-05,
+      "loss": 0.7381,
+      "step": 245
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.4118373950760467,
+      "learning_rate": 2.265772503450122e-05,
+      "loss": 0.733,
+      "step": 246
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3870652323340908,
+      "learning_rate": 2.2002509106181624e-05,
+      "loss": 0.7386,
+      "step": 247
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.4071335392815865,
+      "learning_rate": 2.1355733569044635e-05,
+      "loss": 0.8066,
+      "step": 248
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.3549101773169867,
+      "learning_rate": 2.0717468413015283e-05,
+      "loss": 0.7472,
+      "step": 249
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.40988638302403924,
+      "learning_rate": 2.008778270707944e-05,
+      "loss": 0.7379,
+      "step": 250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3732135073629364,
+      "learning_rate": 1.946674459180955e-05,
+      "loss": 0.7551,
+      "step": 251
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3892117922435638,
+      "learning_rate": 1.8854421271990964e-05,
+      "loss": 0.7199,
+      "step": 252
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.3948588558544994,
+      "learning_rate": 1.8250879009349398e-05,
+      "loss": 0.7683,
+      "step": 253
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4167929485162073,
+      "learning_rate": 1.7656183115380577e-05,
+      "loss": 0.7646,
+      "step": 254
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.40307942016131704,
+      "learning_rate": 1.707039794428259e-05,
+      "loss": 0.7825,
+      "step": 255
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3861750378862546,
+      "learning_rate": 1.649358688599191e-05,
+      "loss": 0.7938,
+      "step": 256
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.3674476689858612,
+      "learning_rate": 1.5925812359323745e-05,
+      "loss": 0.715,
+      "step": 257
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.353536207299889,
+      "learning_rate": 1.5367135805217458e-05,
+      "loss": 0.7537,
+      "step": 258
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3566695168378345,
+      "learning_rate": 1.4817617680087825e-05,
+      "loss": 0.7089,
+      "step": 259
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.3866955306554242,
+      "learning_rate": 1.4277317449282834e-05,
+      "loss": 0.7642,
+      "step": 260
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3451834650287931,
+      "learning_rate": 1.3746293580648717e-05,
+      "loss": 0.718,
+      "step": 261
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3579318858260466,
+      "learning_rate": 1.3224603538202929e-05,
+      "loss": 0.7603,
+      "step": 262
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.37994279223210864,
+      "learning_rate": 1.2712303775915802e-05,
+      "loss": 0.7226,
+      "step": 263
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.353484025350768,
+      "learning_rate": 1.220944973160133e-05,
+      "loss": 0.6795,
+      "step": 264
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.31725525771653623,
+      "learning_rate": 1.1716095820918216e-05,
+      "loss": 0.6401,
+      "step": 265
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3760258159071039,
+      "learning_rate": 1.1232295431481222e-05,
+      "loss": 0.722,
+      "step": 266
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.4008131668307014,
+      "learning_rate": 1.0758100917083991e-05,
+      "loss": 0.7293,
+      "step": 267
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.37877425308055057,
+      "learning_rate": 1.0293563592033595e-05,
+      "loss": 0.7695,
+      "step": 268
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.398401979959032,
+      "learning_rate": 9.838733725597615e-06,
+      "loss": 0.7923,
+      "step": 269
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.41517320547877595,
+      "learning_rate": 9.393660536564408e-06,
+      "loss": 0.8367,
+      "step": 270
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.35876230173175805,
+      "learning_rate": 8.958392187916841e-06,
+      "loss": 0.6976,
+      "step": 271
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.4224162188707925,
+      "learning_rate": 8.532975781620512e-06,
+      "loss": 0.7366,
+      "step": 272
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.39780170702610357,
+      "learning_rate": 8.117457353526625e-06,
+      "loss": 0.7286,
+      "step": 273
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3897116373056276,
+      "learning_rate": 7.711881868390291e-06,
+      "loss": 0.7202,
+      "step": 274
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3677076713046321,
+      "learning_rate": 7.3162932150046885e-06,
+      "loss": 0.7237,
+      "step": 275
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.38206776024573436,
+      "learning_rate": 6.930734201451816e-06,
+      "loss": 0.7572,
+      "step": 276
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.43436169752208104,
+      "learning_rate": 6.555246550469907e-06,
+      "loss": 0.728,
+      "step": 277
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.35030409330791934,
+      "learning_rate": 6.189870894938587e-06,
+      "loss": 0.7104,
+      "step": 278
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.40894521285668217,
+      "learning_rate": 5.834646773481811e-06,
+      "loss": 0.7701,
+      "step": 279
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3864439992817407,
+      "learning_rate": 5.489612626189245e-06,
+      "loss": 0.6639,
+      "step": 280
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3730212277592263,
+      "learning_rate": 5.154805790456485e-06,
+      "loss": 0.7183,
+      "step": 281
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.4085850406480258,
+      "learning_rate": 4.830262496944693e-06,
+      "loss": 0.7971,
+      "step": 282
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3871206930534331,
+      "learning_rate": 4.516017865659949e-06,
+      "loss": 0.7049,
+      "step": 283
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3415173703088719,
+      "learning_rate": 4.21210590215273e-06,
+      "loss": 0.6897,
+      "step": 284
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.39925143692468934,
+      "learning_rate": 3.918559493838114e-06,
+      "loss": 0.7767,
+      "step": 285
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.3494117404036923,
+      "learning_rate": 3.6354104064368566e-06,
+      "loss": 0.679,
+      "step": 286
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.40968356363514646,
+      "learning_rate": 3.3626892805379562e-06,
+      "loss": 0.7508,
+      "step": 287
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3778334267143803,
+      "learning_rate": 3.100425628282899e-06,
+      "loss": 0.7164,
+      "step": 288
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3913056347351327,
+      "learning_rate": 2.848647830172024e-06,
+      "loss": 0.7484,
+      "step": 289
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.35751729016448386,
+      "learning_rate": 2.607383131993424e-06,
+      "loss": 0.7302,
+      "step": 290
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.36765973716443234,
+      "learning_rate": 2.3766576418745022e-06,
+      "loss": 0.7465,
+      "step": 291
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.3593855102348625,
+      "learning_rate": 2.1564963274568027e-06,
+      "loss": 0.6448,
+      "step": 292
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.380756537904924,
+      "learning_rate": 1.9469230131940907e-06,
+      "loss": 0.7396,
+      "step": 293
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4712559845008022,
+      "learning_rate": 1.7479603777742938e-06,
+      "loss": 0.7756,
+      "step": 294
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.39606227316523035,
+      "learning_rate": 1.559629951665298e-06,
+      "loss": 0.724,
+      "step": 295
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.39405843824990977,
+      "learning_rate": 1.3819521147851123e-06,
+      "loss": 0.74,
+      "step": 296
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3549748724309065,
+      "learning_rate": 1.2149460942964098e-06,
+      "loss": 0.6828,
+      "step": 297
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.7013754933226755,
+      "learning_rate": 1.05862996252597e-06,
+      "loss": 0.6775,
+      "step": 298
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3634982384892156,
+      "learning_rate": 9.130206350089765e-07,
+      "loss": 0.7441,
+      "step": 299
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.32584430290795596,
+      "learning_rate": 7.781338686584927e-07,
+      "loss": 0.7017,
+      "step": 300
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.3868531691290023,
+      "learning_rate": 6.539842600603918e-07,
+      "loss": 0.6881,
+      "step": 301
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.442008696929735,
+      "learning_rate": 5.405852438937764e-07,
+      "loss": 0.729,
+      "step": 302
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.4836393444557512,
+      "learning_rate": 4.3794909147720773e-07,
+      "loss": 0.7841,
+      "step": 303
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3718865495552428,
+      "learning_rate": 3.4608690944071263e-07,
+      "loss": 0.7214,
+      "step": 304
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.4176425641377951,
+      "learning_rate": 2.6500863852395584e-07,
+      "loss": 0.7607,
+      "step": 305
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.42030547898748094,
+      "learning_rate": 1.947230525005006e-07,
+      "loss": 0.7717,
+      "step": 306
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.4268942071220283,
+      "learning_rate": 1.3523775722834587e-07,
+      "loss": 0.7511,
+      "step": 307
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.402494449397538,
+      "learning_rate": 8.655918982689581e-08,
+      "loss": 0.7278,
+      "step": 308
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.3797258712905008,
+      "learning_rate": 4.8692617980350406e-08,
+      "loss": 0.7164,
+      "step": 309
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.38113282174469687,
+      "learning_rate": 2.164213936770576e-08,
+      "loss": 0.7225,
+      "step": 310
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3733461808316852,
+      "learning_rate": 5.410681219286673e-09,
+      "loss": 0.7236,
+      "step": 311
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.39801701198883677,
+      "learning_rate": 0.0,
+      "loss": 0.699,
+      "step": 312
+    },
+    {
+      "epoch": 0.9984,
+      "step": 312,
+      "total_flos": 282923046273024.0,
+      "train_loss": 0.7901752627430818,
+      "train_runtime": 4907.5347,
+      "train_samples_per_second": 1.019,
+      "train_steps_per_second": 0.064
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 282923046273024.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}